package com.fsenablers.fuzzy; import java.util.regex.*; import java.util.*; public class FuzzyCompare { private static Pattern s_participlesPattern = Pattern.compile( "\\+|&|AND | THE |^THE | OF "); private static Pattern s_abreviationsPattern = Pattern.compile( "INC[. ]|INC$| CO[. ]|CO$|CORP[. ]|CORP$|COMPANY|LLC[. ]|LLC$|LTD\\.|LTD$|DIV\\.|DIVISION" ); private static Pattern s_superfluousPattern = Pattern.compile( "[-.,'()#\\/& ]"); private static Map s_substitutions; static { s_substitutions = new HashMap(); s_substitutions.put( Pattern.compile( "WHLSLE[. ]|WHLSL[. ]|WHLSE[. ]|WHSLE[. ]" ), "WHOLESALE " ); s_substitutions.put( Pattern.compile( "BROS[. ]|BROS$" ), "BROTHERS " ); s_substitutions.put( Pattern.compile( "SPL[. ]|SPL$" ), "SUPPLY " ); s_substitutions.put( Pattern.compile( "SVCS[. ]|SVCS$" ), "SERVICES " ); s_substitutions.put( Pattern.compile( "SVC[. ]|SVC$" ), "SERVICE " ); s_substitutions.put( Pattern.compile( "AMER[. ]" ), "AMERICAN " ); s_substitutions.put( Pattern.compile( "FRZN[. ]" ), "FROZEN " ); s_substitutions.put( Pattern.compile( "FDSVC[. ]"), "FOODSERVICE " ); s_substitutions.put( Pattern.compile( "FT\\."), "FORT" ); s_substitutions.put( Pattern.compile( "SYSCO FOOD SERVICES"), "SYSCO" ); s_substitutions.put( Pattern.compile( "REINHART FOODSERVICE" ), "REINHART" ); s_substitutions.put( Pattern.compile( "US FOODSERVICE" ), "USFS" ); s_substitutions.put( Pattern.compile( "GORDON FOOD SERVICE" ), "GFS" ); } public int computeEditDistance( String a_source, String a_target ) { String s = prepareString( a_source ); String t = prepareString( a_target ); int d[][]; // matrix int n; // length of s int m; // length of t int i; // iterates through s int j; // iterates through t char s_i; // ith character of s char t_j; // jth character of t int cost; // cost // Step 1 n = s.length(); m = t.length(); if (n == 0) { return m; } if (m == 0) { return n; } // Step 2 d = new int[n+1][m+1]; for (i = 0; i <= n; i++) { d[i][0] = i; } for (j = 0; j <= m; j++) { d[0][j] = j; } // Step 3 for (i = 1; i <= n; i++) { s_i = s.charAt (i - 1); // Step 4 for (j = 1; j <= m; j++) { t_j = t.charAt (j - 1); // Step 5 if (s_i == t_j) { cost = 0; } else { cost = 1; } // Step 6 int left = d[i-1][j]+1; int right = d[i][j-1]+1; left = Math.min( left, right ); right = d[i-1][j-1] + cost; d[i][j] = Math.min(left, right); } } // Step 7 return d[n][m]; } public int computeEditDistance2( String a_source, String a_target ) { int maxOffset = 100; String s1 = prepareString( a_source ); String s2 = prepareString( a_target ); if ( s1.length() == 0 ) { if ( s2.length() == 0 ) { return 0; } return s2.length(); } if (s2.length() == 0) { return s1.length(); } int l1=s1.length(); int l2=s2.length(); char s1a[] = s1.toCharArray(); char s2a[] = s2.toCharArray(); int c1 = 0; //cursor for string 1 int c2 = 0; //cursor for string 2 int lcss = 0; //largest common subsequence int local_cs = 0; //local common substring while ((c1 < l1) && (c2 < l2)) { if (s1a[c1] == s2a[c2]) { local_cs++; } else { lcss+=local_cs; local_cs=0; if (c1!=c2) { c1=c2=Math.max(c1,c2); //using max to bypass the need for computer transpositions ('ab' vs 'ba') } for (int i = 0; i < maxOffset && (c1+i