Approximate Text Matching
An Introduction with Examples in Java |
Prof. David Bernstein |
Computer Science Department |
bernstdh@jmu.edu |
/**
 * A utility class that implements the Soundex algorithm
 * for finding strings that sound the same.
 *
 * The code consists of the first letter of the input followed by up to
 * three digits that classify the consonants that follow it; the code is
 * padded on the right with '0' when there are too few consonants.
 *
 * @author  Prof. David Bernstein, James Madison University
 * @version 1.0
 */
public class Soundex
{
    /**
     * Construct a 4-character Soundex code from a String.
     *
     * Only the letters A-Z (after conversion to upper case) take part in
     * the coding; every other character is ignored. If the String contains
     * no such letters (for example, it is empty) the result is "0000".
     *
     * @param s   The String to convert
     * @return    The Soundex code (always exactly 4 characters long)
     * @throws NullPointerException if s is null
     */
    public static String toCode(String s)
    {
        if (s == null)
        {
            throw new NullPointerException("s must not be null");
        }

        char[] code = {'0', '0', '0', '0'};

        // Use a locale-independent conversion. With the default locale a
        // Turkish/Azeri environment maps 'i' to U+0130, which would then be
        // discarded as "non-alphabetic" instead of being treated as a vowel.
        String upper = s.toUpperCase(java.util.Locale.ROOT);

        // Keep only the alphabetic characters
        StringBuilder letters = new StringBuilder(upper.length());
        for (int i = 0; i < upper.length(); i++)
        {
            char c = upper.charAt(i);
            if ((c >= 'A') && (c <= 'Z'))
            {
                letters.append(c);
            }
        }

        // Nothing to code (this case used to fail with an
        // ArrayIndexOutOfBoundsException)
        if (letters.length() == 0)
        {
            return new String(code);
        }

        // The first letter is always retained as is, but its class still
        // takes part in duplicate detection (e.g., "Pfister" is P236)
        code[0] = letters.charAt(0);
        char previous = classify(letters.charAt(0));
        int filled = 1;

        for (int i = 1; (i < letters.length()) && (filled < code.length); i++)
        {
            char current = classify(letters.charAt(i));

            // H and W are transparent: they are dropped BEFORE duplicates
            // are detected, so they do not separate equal codes
            if ((current == 'H') || (current == 'W'))
            {
                continue;
            }

            // Collapse adjacent duplicates
            if (current == previous)
            {
                continue;
            }
            previous = current;

            // Vowels are dropped AFTER duplicates are detected, so they
            // do separate equal codes (e.g., "Tymczak" is T522)
            if (isVowel(current))
            {
                continue;
            }

            code[filled] = current;
            filled++;
        }

        return new String(code);
    }

    /**
     * Map an upper-case letter to its Soundex digit.
     *
     * @param c   The letter (in the range A-Z)
     * @return    The digit '1'-'6' for a coded consonant, otherwise c itself
     */
    private static char classify(char c)
    {
        switch (c)
        {
            case 'B': case 'P': case 'F': case 'V':
                return '1';
            case 'C': case 'S': case 'K': case 'G':
            case 'J': case 'Q': case 'X': case 'Z':
                return '2';
            case 'D': case 'T':
                return '3';
            case 'L':
                return '4';
            case 'M': case 'N':
                return '5';
            case 'R':
                return '6';
            default:
                return c;
        }
    }

    /**
     * Determine whether a character is treated as a vowel by the
     * algorithm (Y is included).
     *
     * @param c   The character to test
     * @return    true if c is one of A, E, I, O, U, Y
     */
    private static boolean isVowel(char c)
    {
        return (c == 'A') || (c == 'E') || (c == 'I')
            || (c == 'O') || (c == 'U') || (c == 'Y');
    }
}