|
Approximate Text Matching
An Introduction with Examples in Java |
|
Prof. David Bernstein |
| Computer Science Department |
| bernstdh@jmu.edu |
/**
 * A utility class that implements the Soundex algorithm
 * for finding strings that sound the same.
 *
 * A Soundex code consists of the first letter of the String followed by
 * up to three digits that encode the consonants that follow it. Codes
 * that would be shorter than four characters are padded with '0'.
 *
 * @author  Prof. David Bernstein, James Madison University
 * @version 1.0
 */
public class Soundex
{
    /** The number of characters in every Soundex code. */
    private static final int CODE_LENGTH = 4;

    /** The character used to pad codes that would otherwise be too short. */
    private static final char PAD = '0';

    /** The value returned by encode() for letters that have no digit. */
    private static final char NO_DIGIT = '\0';

    /**
     * Construct a 4-character Soundex code from a String.
     *
     * Characters other than the letters A-Z (after conversion to upper
     * case) are ignored. If the String contains no such letters
     * (e.g., it is empty) the result is "0000".
     *
     * @param s   The String to convert
     * @return    The Soundex code (always 4 characters long)
     * @throws NullPointerException if s is null
     */
    public static String toCode(String s)
    {
        // Use a locale-independent conversion so that, for example, 'i'
        // always becomes 'I' (in a Turkish locale it would become U+0130,
        // which is not in A-Z and would be discarded)
        char[] chars = s.toUpperCase(java.util.Locale.ROOT).toCharArray();

        StringBuilder code = new StringBuilder(CODE_LENGTH);

        // The digit (or NO_DIGIT) of the most recent letter that was
        // not skipped; used to collapse runs of letters with the same digit
        char previous = NO_DIGIT;

        for (char c : chars)
        {
            if (code.length() == CODE_LENGTH) break;

            // Ignore non-alphabetic characters
            if ((c < 'A') || (c > 'Z')) continue;

            if (code.length() == 0)
            {
                // The first letter is kept as-is, but its digit still
                // participates in duplicate detection (e.g., the F in
                // "Pfister" is dropped because P and F share a digit)
                code.append(c);
                previous = encode(c);
                continue;
            }

            // H and W are ignored completely, so letters with the same
            // digit that are separated only by H or W count as duplicates
            if ((c == 'H') || (c == 'W')) continue;

            char digit = encode(c);

            // Vowels aren't included in the code but they do separate
            // consonants (which is why previous is updated for them too)
            if ((digit != NO_DIGIT) && (digit != previous))
            {
                code.append(digit);
            }
            previous = digit;
        }

        // Pad short codes (including the code for an input with no letters)
        while (code.length() < CODE_LENGTH) code.append(PAD);

        return code.toString();
    }

    /**
     * Get the Soundex digit for an upper-case letter.
     *
     * @param letter  The letter (in the range A-Z)
     * @return        The digit ('1'-'6') or NO_DIGIT for A,E,I,O,U,Y,H,W
     */
    private static char encode(char letter)
    {
        switch (letter)
        {
            case 'B': case 'F': case 'P': case 'V':
                return '1';

            case 'C': case 'G': case 'J': case 'K':
            case 'Q': case 'S': case 'X': case 'Z':
                return '2';

            case 'D': case 'T':
                return '3';

            case 'L':
                return '4';

            case 'M': case 'N':
                return '5';

            case 'R':
                return '6';

            default:
                return NO_DIGIT;
        }
    }
}