La función Soundex es un algoritmo fonético para indexar nombres según el sonido de su pronunciación en inglés. Los códigos Soundex de diferentes cadenas de texto pueden compararse para ver qué tan similares son dos cadenas de texto cuando se pronuncian.
El primer carácter del código Soundex es el primer carácter de la expresión dada, convertido a mayúscula. Los siguientes caracteres del código son números que representan las letras de la expresión. Las letras A, E, I, O, U, H, W e Y son ignoradas a no ser que sean la primera letra de la cadena de texto. Todos los caracteres internacionales fuera del rango A-Z son tratados como vocales. Por lo tanto, dos cadenas que suenan casi igual deberían tener el mismo índice Soundex. Por ejemplo, las palabras "texto" y "tixto" deberían producir el código Soundex T230.
En este artículo encontrarás la implementación del conocido algoritmo en los siguientes lenguajes de programación:
¡Comencemos!
C
#include <stdio.h>
/*
 * Soundex class of every 7-bit ASCII character, filled in by init():
 *   -1    vowel (A, E, I, O, U): separates two equal consonant codes
 *    0    ignored (H, W, Y and every character that is not a letter)
 *   1..6  consonant class
 * The element type is explicitly signed so that the -1 marker survives on
 * platforms where plain char is unsigned.
 */
static signed char code[128] = { 0 };

/* Returns the table entry for ch; bytes outside 7-bit ASCII are ignored (0). */
static int code_of(char ch)
{
    unsigned char u = (unsigned char)ch;
    return u < sizeof code ? code[u] : 0;
}

/*
 * Computes the four-character Soundex code of s.
 *
 * init() must have been called first. Returns an empty string when s is NULL
 * or empty. The result lives in a static buffer that is overwritten by the
 * next call, so copy it if it has to be kept.
 */
const char* soundex(const char *s)
{
    static char out[5];
    int c, prev, i;

    out[0] = out[4] = 0;
    if (!s || !*s) return out;

    /* The code always starts with the first character, in upper case. */
    out[0] = *s++;
    if (out[0] >= 'a' && out[0] <= 'z') out[0] ^= 0x20;

    /* The first letter is kept verbatim, but its class still suppresses an
       equal class that follows immediately (e.g. the "f" in "Pfister"). */
    prev = code_of(out[0]);
    for (i = 1; *s && i < 4; s++) {
        c = code_of(*s);
        if (c == prev) continue;
        if (c == -1) prev = 0; /* vowel as separator */
        else if (c > 0) {
            out[i++] = (char)(c + '0');
            prev = c;
        }
    }
    while (i < 4) out[i++] = '0';
    return out;
}

/*
 * Assigns class c to every character of s and to the same character with the
 * ASCII case bit (0x20) flipped. Bytes outside 7-bit ASCII are skipped because
 * the table has no slot for them.
 */
void add_code(const char *s, int c)
{
    for (; *s; s++) {
        unsigned char u = (unsigned char)*s;
        if (u >= sizeof code) continue;
        code[u] = code[u ^ 0x20] = (signed char)c;
    }
}

/* Fills the class table; call once before the first soundex() call. */
void init(void)
{
    /* The position in this list minus one is the class: vowels get -1,
       the empty group reserves 0 for "ignored". */
    static const char *cls[] =
        { "AEIOU", "", "BFPV", "CGJKQSXZ", "DT", "L", "MN", "R", 0};
    int i;

    for (i = 0; cls[i]; i++)
        add_code(cls[i], i - 1);
}
Uso
/* Demo: builds the class table and prints the Soundex code of one word. */
int main(void)
{
    init();
    /* Prints J126. The code is passed as an argument, never as the format
       string, so a '%' in the data could not be misread as a directive. */
    printf("%s\n", soundex("Javascript"));
    return 0;
}
C#
using System.Text.RegularExpressions;
/// <summary>
/// Computes four-character Soundex codes.
/// </summary>
public static class Soundex
{
    private const int MaxSoundexCodeLength = 4;

    /// <summary>
    /// Returns the Soundex code of <paramref name="word"/>.
    /// </summary>
    /// <param name="word">Text to encode; punctuation is stripped first.</param>
    /// <returns>
    /// A four-character code, or "0000" when <paramref name="word"/> is null
    /// or contains nothing to encode.
    /// </returns>
    public static string For(string word)
    {
        // Fully qualified so the class does not depend on a using directive
        // for System.Text.
        var soundexCode = new System.Text.StringBuilder();
        var previousWasHOrW = false;

        // Invariant upper-casing keeps the A-Z comparisons below independent
        // of the current culture (e.g. the Turkish dotted/dotless I).
        word = Regex.Replace(
            word == null ? string.Empty : word.ToUpperInvariant(),
            @"[^\w\s]",
            string.Empty);

        if (string.IsNullOrEmpty(word))
            return new string('0', MaxSoundexCodeLength);

        soundexCode.Append(word[0]);

        for (var i = 1; i < word.Length; i++)
        {
            var current = GetCharNumberForLetter(word[i]);

            // The second letter is dropped when it shares the first letter's code.
            if (i == 1 && current == GetCharNumberForLetter(soundexCode[0]))
                continue;

            // Equal codes separated only by H or W count as one.
            if (soundexCode.Length > 2 && previousWasHOrW &&
                current == soundexCode[soundexCode.Length - 2])
                continue;

            // Adjacent equal codes count as one.
            if (current == soundexCode[soundexCode.Length - 1])
                continue;

            soundexCode.Append(current);
            previousWasHOrW = word[i] == 'H' || word[i] == 'W';
        }

        // The '0' placeholders only served as separators; remove them, then
        // pad or cut to the fixed code length.
        return soundexCode
            .Replace("0", string.Empty)
            .ToString()
            .PadRight(MaxSoundexCodeLength, '0')
            .Substring(0, MaxSoundexCodeLength);
    }

    /// <summary>
    /// Maps an upper-case letter to its Soundex digit; every other character
    /// maps to '0'.
    /// </summary>
    private static char GetCharNumberForLetter(char letter)
    {
        switch (letter)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';
            case 'C': case 'G': case 'J': case 'K':
            case 'Q': case 'S': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return '0';
        }
    }
}
Uso
// A comparison on its own is not a valid C# statement, so keep the result.
bool soundAlike = Soundex.For("CSharp Language") == Soundex.For("CSherp Language"); // true: both encode to C614
D
The D standard library (Phobos) already contains a soundex function.
import std.stdio: writeln;
import std.string: soundex;
void main() {
    // Each entry pairs an input word with the Soundex code Phobos must return.
    static immutable string[2][] cases = [
        ["soundex",   "S532"],
        ["example",   "E251"],
        ["ciondecks", "C532"],
        ["ekzampul",  "E251"],
        ["Robert",    "R163"],
        ["Rupert",    "R163"],
        ["Rubin",     "R150"],
        ["Ashcraft",  "A261"],
        ["Ashcroft",  "A261"],
        ["Tymczak",   "T522"],
    ];

    foreach (pair; cases)
        assert(soundex(pair[0]) == pair[1]);
}
F#
/// Returns the four-character American Soundex code of `x`,
/// or "0000" when `x` is null or empty.
let americanSoundex (x : string) =
    // Soundex digit of an upper-case letter. '0' marks everything that is not
    // coded (vowels, Y, non-letters); such characters separate equal codes.
    let codeOf ch =
        match ch with
        | 'B' | 'F' | 'P' | 'V' -> '1'
        | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> '2'
        | 'D' | 'T' -> '3'
        | 'L' -> '4'
        | 'M' | 'N' -> '5'
        | 'R' -> '6'
        | _ -> '0'
    // Test for empty input before doing any work: the encoding below needs a
    // first letter.
    if System.String.IsNullOrEmpty x then "0000"
    else
        let letters = x.ToUpperInvariant().ToCharArray() |> Array.toList
        // The first letter is always kept verbatim, even when it is H or W.
        let first = List.head letters
        // State: (code of the previous significant letter, digits so far, reversed).
        let step (previous, acc) ch =
            match ch with
            // H and W are transparent: they neither emit nor separate codes.
            | 'H' | 'W' -> (previous, acc)
            | _ ->
                let code = codeOf ch
                if code <> '0' && code <> previous then (code, code :: acc)
                else (code, acc)
        // Seeding with the first letter's own code drops a second letter of
        // the same class (the "f" in "Pfister").
        let (_, reversedDigits) =
            letters |> List.tail |> List.fold step (codeOf first, [])
        let padded = (first :: List.rev reversedDigits) @ ['0'; '0'; '0']
        new System.String(padded |> Seq.truncate 4 |> Seq.toArray)
// Print every sample name, left-aligned in eight columns, next to its code.
for name in [ "Robert"; "Rupert"; "Robbert"; "Rubin"
              "Beer"; "Bear"; "Bearer"
              "Smith"; "Smyth"
              "Ashcraft"; "Ashcroft"
              "Tymczak"; "Pfister" ] do
    printfn "%-8s = %s" name (americanSoundex name)
(*
Robert = R163
Rupert = R163
Robbert = R163
Rubin = R150
Beer = B600
Bear = B600
Bearer = B660
Smith = S530
Smyth = S530
Ashcraft = A261
Ashcroft = A261
Tymczak = T522
Pfister = P236
*)
Go
package myPackageName
import (
"bytes"
"strings"
"fmt"
)
// codeLen is the fixed length of a Soundex code: one letter plus three digits.
const codeLen = 4

// codes maps a lower-case letter to its Soundex digit. Letters that are not
// coded (vowels, h, w, y) map to the empty string.
var codes = map[string]string{
	"a": "", "e": "", "i": "", "o": "", "u": "",
	"h": "", "w": "", "y": "",
	"b": "1", "f": "1", "p": "1", "v": "1",
	"c": "2", "g": "2", "j": "2", "k": "2",
	"q": "2", "s": "2", "x": "2", "z": "2",
	"d": "3", "t": "3",
	"l": "4",
	"m": "5", "n": "5",
	"r": "6",
}

// Soundex returns the four-character Soundex code of s in upper case.
// It returns an empty string when s is empty.
func Soundex(s string) string {
	if len(s) == 0 {
		return ""
	}

	var encoded bytes.Buffer
	encoded.WriteByte(s[0])

	// previous holds the code of the last significant letter. Starting with
	// the first letter's own code drops a second letter of the same class
	// (the "f" in "Pfister").
	previous := codes[strings.ToLower(string(s[0]))]
	for i := 1; i < len(s) && encoded.Len() < codeLen; i++ {
		current := strings.ToLower(string(s[i]))
		// h and w are transparent: equal codes on both sides count as one.
		if current == "h" || current == "w" {
			continue
		}
		code := codes[current]
		if code != "" && code != previous {
			encoded.WriteByte(code[0])
		}
		// Vowels and unknown characters have the empty code, which lets an
		// equal code on either side of them be written twice.
		previous = code
	}

	if encoded.Len() < codeLen {
		encoded.WriteString(strings.Repeat("0", codeLen-encoded.Len()))
	}
	return strings.ToUpper(encoded.String())
}
Uso
// main prints the Soundex code of a sample word.
func main() {
	// Expected output: J126
	code := Soundex("Javascript")
	fmt.Println(code)
}
Java
/**
 * Returns the Soundex digit of an upper-case letter as a one-character
 * string, or the empty string for every character that is not coded.
 */
private static String getCode(char c){
    if ("BFPV".indexOf(c) >= 0) {
        return "1";
    }
    if ("CGJKQSXZ".indexOf(c) >= 0) {
        return "2";
    }
    if ("DT".indexOf(c) >= 0) {
        return "3";
    }
    if (c == 'L') {
        return "4";
    }
    if ("MN".indexOf(c) >= 0) {
        return "5";
    }
    if (c == 'R') {
        return "6";
    }
    return "";
}
/**
 * Computes the four-character Soundex code of {@code s}.
 *
 * @param s the text to encode
 * @return the first character in upper case followed by three digits,
 *         or the empty string when {@code s} is null or empty
 */
public static String soundex(String s){
    if (s == null || s.isEmpty()) {
        return "";
    }
    // Upper-case once, independently of the default locale, so the A-Z
    // comparisons in getCode behave the same everywhere.
    String upper = s.toUpperCase(java.util.Locale.ROOT);
    StringBuilder code = new StringBuilder().append(upper.charAt(0));
    // Starting with the first letter's own code drops a second letter of the
    // same class (the "f" in "Pfister").
    String previous = getCode(upper.charAt(0));
    for (int i = 1; i < upper.length() && code.length() < 4; i++) {
        String current = getCode(upper.charAt(i));
        if (!current.isEmpty() && !current.equals(previous)) {
            code.append(current);
        }
        // Uncoded characters reset previous, so they separate equal codes.
        previous = current;
    }
    while (code.length() < 4) {
        code.append('0');
    }
    return code.toString();
}
Uso
/** Prints the Soundex codes of a few sample words, one per line. */
public static void main(String[] args){
    // Expected output, in order: S532, E251, S532, E251
    String[] samples = {"Soundex", "Example", "Sownteks", "Ekzampul"};
    for (String sample : samples) {
        System.out.println(soundex(sample));
    }
}
Javascript
/**
 * Computes the four-character Soundex code of a string.
 *
 * @param {string} s Text to encode.
 * @returns {string} First letter in upper case followed by three digits, or
 *     an empty string when `s` is empty or not a string.
 */
var soundex = function(s) {
    // Without a first letter the concatenation below would start with the
    // text "undefined" and yield the bogus code "UNDE".
    if (typeof s !== 'string' || s.length === 0) {
        return '';
    }

    var codes = { a: '', e: '', i: '', o: '', u: '', b: 1, f: 1, p: 1, v: 1, c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2, d: 3, t: 3, l: 4, m: 5, n: 5, r: 6 };
    var letters = s.toLowerCase().split('');
    var first = letters.shift();

    var digits = letters
        .map(function(letter) {
            // Characters missing from the table map to undefined, which
            // join() renders as an empty string.
            return codes[letter];
        })
        .filter(function(code, i, mapped) {
            // Drop a code equal to the one right before it; the very first
            // one is compared with the code of the leading letter.
            var previous = (i === 0) ? codes[first] : mapped[i - 1];
            return code !== previous;
        })
        .join('');

    return (first + digits + '000').slice(0, 4).toUpperCase();
};
Uso
soundex("Javascript") === soundex("Jabascript"); // true: both encode to J126
Objective-C
You can find an Objective-C implementation of the Soundex algorithm in this GitHub gist, written by Darkseed.
PHP
PHP already provides soundex as a built-in function that calculates the soundex key of a string.
Uso
soundex("PHP Server Language") == soundex("PHP Serber language"); // True: non-letters are skipped, so both calls return P261
Python
Function
def get_soundex(name):
    """Return the four-character Soundex code of ``name``.

    The code is the first character in upper case followed by three digits.
    Vowels, H, W, Y and every character outside A-Z are not coded; they only
    separate equal consonant codes. An empty ``name`` yields an empty string.
    """
    groups = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4", "MN": "5", "R": "6"}

    def code_of(char):
        # "." marks a character that is not coded (vowel, H, W, Y, non-letter).
        for letters, digit in groups.items():
            if char in letters:
                return digit
        return "."

    name = name.upper()
    if not name:
        return ""

    soundex = name[0]
    # Compare against the first letter's *code*, not the letter itself, so a
    # second letter of the same class is dropped ("Schultz" -> S432).
    previous = code_of(name[0])
    for char in name[1:]:
        code = code_of(char)
        if code != "." and code != previous:
            soundex += code
        previous = code
    return soundex[:4].ljust(4, "0")
Uso
# Named `names` rather than `list` so the builtin `list` type is not shadowed.
names = ["Smith", "Smythe", "Robert", "Rupert", "Schultz", "Shultz"]
print("NAME\t\tSOUNDEX")
for name in names:
    print("%s\t\t%s" % (name, get_soundex(name)))
Library
If you prefer to use a library, you can use the fuzzy package (which uses C Extensions (via Pyrex) for speed).
Ruby
class String
  # Coded letters and, position by position, the Soundex digit of each one.
  SoundexChars = 'BFPVCGJKQSXZDTLMNR'
  SoundexNums  = '111122222222334556'
  SoundexCharsEx = '^' + SoundexChars
  SoundexCharsDel = '^A-Z'

  # desc: http://en.wikipedia.org/wiki/Soundex
  #
  # Returns the Soundex code of the string: the first letter in upper case
  # followed by at least three digits. With census = true (the default) the
  # digits are cut to three; with census = false every digit is kept.
  # Returns '' when the string contains no letter A-Z.
  def soundex(census = true)
    str = upcase.delete(SoundexCharsDel)
    return '' if str.empty?

    # Turn coded letters into digits; uncoded letters stay as they are so that
    # vowels keep equal codes apart ("TYMCZAK" -> "3Y522A2").
    coded = str.tr(SoundexChars, SoundexNums)
    # H and W after the first letter never separate codes, so drop them before
    # collapsing runs. The first letter's digit takes part in the collapse
    # (the "f" in "Pfister") and is then cut off together with the vowels.
    digits = (coded[0, 1] + coded[1..-1].delete('HW')).squeeze('1-6')[1..-1].delete('^1-6')
    digits = digits[0..2] if census
    str[0, 1] + digits.ljust(3, '0')
  end

  # True when both strings have the same Soundex code.
  def sounds_like(other)
    soundex == other.soundex
  end
end
Uso
# Walk the sample words in pairs: show each code, then whether the pair matches.
%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |first_word, second_word|
  [first_word, second_word].each do |word|
    puts format('%-8s -> %s', word, word.soundex)
  end
  verdict = first_word.sounds_like(second_word) ? "sounds" : "does not sound"
  puts "'#{first_word}' #{verdict} like '#{second_word}'"
end
#Soundex -> S532
#Sownteks -> S532
#'Soundex' sounds like 'Sownteks'
#Example -> E251
#Ekzampul -> E251
#'Example' sounds like 'Ekzampul'
#foo -> F000
#bar -> B600
#'foo' does not sound like 'bar'
Scala
/** Computes the four-character Soundex code of `s`.
  *
  * Letters without a code (vowels, H, W, Y) and any other character separate
  * equal consonant codes. Returns the empty string when `s` is empty.
  */
def soundex(s:String)={
  if (s.isEmpty) ""
  else {
    val first = s.head.toUpper
    // State: (digits collected so far, code of the previous character).
    // Starting from the first letter's own code drops a second letter of the
    // same class.
    val (digits, _) = s.drop(1).foldLeft(("", getCode(first))) {
      case ((collected, previous), ch) =>
        val current = getCode(ch.toUpper)
        if (current.nonEmpty && current != previous) (collected + current, current)
        else (collected, current)
    }
    (first.toString + digits + "0000").take(4)
  }
}
/** Returns the Soundex digit of an upper-case letter as a string,
  * or the empty string for every character that is not coded.
  */
def getCode(c:Char)= c match {
  case 'B' | 'F' | 'P' | 'V' => "1"
  case 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2"
  case 'D' | 'T' => "3"
  case 'L' => "4"
  case 'M' | 'N' => "5"
  case 'R' => "6"
  case _ => ""
}
Uso
/** Encodes every sample name and reports whether the result matches the
  * expected code listed next to it.
  */
def main(args: Array[String]): Unit = {
  // name -> expected Soundex code
  val tests=Map(
    "Soundex" -> "S532",
    "Euler" -> "E460",
    "Gauss" -> "G200",
    "Hilbert" -> "H416",
    "Knuth" -> "K530",
    "Lloyd" -> "L300",
    "Lukasiewicz" -> "L222",
    "Ellery" -> "E460",
    "Ghosh" -> "G200",
    "Heilbronn" -> "H416",
    "Kant" -> "K530",
    "Ladd" -> "L300",
    "Lissajous" -> "L222",
    "Wheaton" -> "W350",
    "Ashcraft" -> "A226",
    "Burroughs" -> "B622",
    "Burrows" -> "B620",
    "O'Hara" -> "O600")

  for ((name, expected) <- tests) {
    val actual = soundex(name)
    val status = if (actual == expected) "OK" else "ERROR"
    printf("Name: %-20s Code: %s Found: %s - %s\n", name, expected, actual, status)
  }
}
Swift
The class written by Clifford in this GitHub repository is an implementation of the original Soundex algorithm in the Swift language.
//
// Soundex.swift
// speller
//
// Created by Clifford Helsel on 4/28/16.
//
// Based on standard Soundex algorithm and loosely ported from Apache Commons
// https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/Soundex.html
/// Encodes strings as four-character Soundex codes (one letter, three digits).
public class Soundex {
    /// Number of characters in a Soundex code.
    private static let codeLength = 4

    /// Soundex digit of each letter A...Z, in alphabetical order.
    /// "0" marks letters that are not coded (vowels, H, W, Y).
    private static let digits = Array("01230120022455012623010202")
    private static let alphabet = Array("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

    /// Lookup table from an upper-case letter to its Soundex digit.
    private static let mapping: [Character: Character] =
        Dictionary(uniqueKeysWithValues: zip(alphabet, digits))

    /// Returns the Soundex digit of `c`.
    /// Characters outside A...Z are not specified by Soundex; they get "0".
    private func mapChar(c: Character) -> Character {
        return Soundex.mapping[c] ?? "0"
    }

    /// Computes the Soundex code of a string.
    ///
    /// - Parameter input: The text to encode; case is ignored.
    /// - Returns: The first character in upper case followed by three digits,
    ///   padded with "0", or an empty string when `input` is empty.
    public func soundex(of input: String) -> String {
        let letters = Array(input.uppercased())
        guard let first = letters.first else {
            return ""
        }

        var out: [Character] = [first]
        // Code of the previous significant letter. It starts as the first
        // letter's own code so that a second letter of the same class is
        // dropped.
        var last = mapChar(c: first)

        for letter in letters.dropFirst() {
            if out.count == Soundex.codeLength {
                break
            }
            // H and W are transparent: equal codes on both sides count as one.
            if letter == "H" || letter == "W" {
                continue
            }
            let mapped = mapChar(c: letter)
            if mapped != "0" && mapped != last {
                out.append(mapped)
            }
            // Always remember the latest code, including "0" for vowels, so
            // that a vowel lets the same code be emitted again.
            last = mapped
        }

        while out.count < Soundex.codeLength {
            out.append("0")
        }
        return String(out)
    }
}
Uso
let c = Soundex()
// The standard Soundex code of "Christopher" is C623 (C, r = 6, s = 2, t = 3).
let code = c.soundex(of: "Christopher")
VBScript
' Returns the Soundex digit of an upper-case letter as a string.
' For any other value no case matches and the result is left unassigned (Empty).
Function getCode(c)
    Select Case c
        Case "B", "F", "P", "V" : getCode = "1"
        Case "C", "G", "J", "K", "Q", "S", "X", "Z" : getCode = "2"
        Case "D", "T" : getCode = "3"
        Case "L" : getCode = "4"
        Case "M", "N" : getCode = "5"
        Case "R" : getCode = "6"
    End Select
End Function
' Returns the four-character Soundex code of s: the first character in upper
' case followed by three digits, padded with "0". An empty s yields "0000".
Function soundex(s)
    ' Every local is declared so the function also runs under Option Explicit.
    Dim code, previous, current, i
    code = UCase(Mid(s, 1, 1))
    ' Start from the first letter's own code so that a second letter of the
    ' same class is dropped (the "f" in "Pfister").
    previous = getCode(code)
    For i = 2 To Len(s)
        current = getCode(UCase(Mid(s, i, 1)))
        If Len(current) > 0 And current <> previous Then
            code = code & current
        End If
        ' Uncoded characters reset previous, so they separate equal codes.
        previous = current
    Next
    soundex = Mid(code, 1, 4)
    If Len(code) < 4 Then
        soundex = soundex & String(4 - Len(code), "0")
    End If
End Function
En caso de que conozcas la implementación del algoritmo Soundex en otro lenguaje de programación que no esté en esta lista (o que esté mejor implementada que aquí), por favor compártela con la comunidad en la caja de comentarios. ¡Que te diviertas!
Conviértete en un programador más sociable