JMU
Approximate Text Matching
An Introduction with Examples in Java


Prof. David Bernstein
James Madison University

Computer Science Department
bernstdh@jmu.edu


Motivation
The Soundex Algorithm
The Soundex Algorithm (cont.)
The Soundex Algorithm (cont.)

A Simple Implementation

javaexamples/strings/Soundex.java
        /**
 * A utility class that implements the Soundex algorithm
 * for finding strings that sound the same
 *
 * @author  Prof. David Bernstein, James Madison University
 * @version 1.0
 */
public class Soundex
{

    /**
     * Construct a 4-character Soundex code from a String
     *
     * @param s   The String to convert
     * @return    The Soundex code
     */
    public static String toCode(String s)
    {
        char       c;
        char[]     code = {'0','0','0','0'};
        char[]     chars, coded, noDups, noHW, noVowels, source;
        int        i, index, n, start;


        // Create a character array
        chars = s.toUpperCase().toCharArray();


        // Remove non alphabetic characters
        source = new char[chars.length];        
        index = 0;
        for (i=0; i<chars.length; i++)
        {
           if ((chars[i] < 'A') || (chars[i] > 'Z'))
           {
              // Remove
           }
           else
           {
              source[index] = chars[i];
              index++;
           }
        }
        

        // Code the consonants
        coded = new char[index];
        for (i=0; i<coded.length; i++) {

            c = source[i];
            if       ((c=='B') || (c=='P') || (c=='F') || 
                      (c=='V')                            )   coded[i]='1';

            else if  ((c=='C') || (c=='S') || (c=='K') || 
                      (c=='G') || (c=='J') || (c=='Q') || 
                      (c=='X') || (c=='Z')                )   coded[i]='2';

            else if  ((c=='D') || (c=='T') )                  coded[i]='3';

            else if  ((c=='L'))                               coded[i]='4';

            else if  ((c=='M') || (c=='N') )                  coded[i]='5';

            else if  ((c=='R'))                               coded[i]='6';
            else                                              coded[i]=c;
            
        }


        // Remove H and W  (except if the first letter)
        noHW    = new char[coded.length];
        noHW[0] = coded[0];
        index   = 1;
        for (i=1; i<coded.length; i++)
        {
            if ((coded[i] == 'H') || (coded[i] == 'W'))
            {
               // Remove
            }
            else
            {
                noHW[index] = coded[i];
                index++;
            }
        }
        

        // Remove duplicates
        noDups    = new char[index];
        noDups[0] = noHW[0];
        index     = 1;        
        for (i=1; i<noDups.length; i++)
        {
            if (noHW[i] == noDups[index-1])
            {
                // Duplicate
            }
            else
            {
                noDups[index] = noHW[i];
                index++;
            }
        }


        // Remove the vowels (except if the first letter) and non-alphabetic
        noVowels    = new char[index];
        noVowels[0] = noDups[0];
        index       = 1;
        for (i=1; i<noVowels.length; i++) {

           if ((noDups[i]=='A') || (noDups[i]=='E') || (noDups[i]=='I') ||
               (noDups[i]=='O') || (noDups[i]=='U') || (noDups[i]=='Y')      )
            {
               // Remove
            }
            else
            {
                noVowels[index] = noDups[i];
                index++;
            }
        }


        // Construct the final code
        code[0] = source[0];
        n = code.length;
        if (index < n) n = index;
        for (i=1; i<n; i++) code[i] = noVowels[i];




        return new String(code);
    }







}
        
Simple Phonetic Coding
Simple Phonetic Coding - A Home Grown Algorithm
Ignoring Small Differences
Number Matching
Number Matching (cont.)
Domain-Specific Situations