package com.fsenablers.matching; import java.util.Map; import java.util.HashMap; import java.io.InputStreamReader; import java.io.BufferedReader; import java.io.IOException; import java.util.Collections; import java.util.regex.Pattern; public class StreetAddressStandardizer { private String m_secondaryAddress; private String m_streetNumber; private String m_streetPreDirectional; private String m_streetName; private String m_streetSuffix; private String m_streetPostDirectional; private String m_secondaryAddressRange; private String m_rawPreDirectional; private String m_error; private String m_rawDeliveryAddressLine; private String m_standardDeliveryAddressLine; private static final int STREET_NUMBER = 0; private static final int PRE_DIRECTIONAL = 1; private static final int STREET_NAME = 2; private static final int STREET_SUFFIX = 3; private static final int POST_DIRECTIONAL = 4; private static final int SECONDARY = 5; private static final int SECONDARY_RANGE = 6; private static final int END_OF_TOKENS = 7; private static final int STOP = 8; private String[] s_stageNames = { "STREET_NUMBER ", "PRE_DIRECTIONAL ", "STREET_NAME ", "STREET_SUFFIX ", "POST_DIRECTIONAL ", "SECONDARY ", "SECONDARY_RANGE ", "END_OF_TOKENS ", "STOP " }; private static Map DIRECTIONALS; private static Map SUFFIX_ABBR_MAP; // private static Map SECONDARY_INDICATOR_NO_RANGE_MAP; private static Map SECONDARY_INDICATOR_MAP; private static Pattern s_alphaNumeric; static { s_alphaNumeric = Pattern.compile( ".*"); try { SUFFIX_ABBR_MAP = loadStandardAddressPartMap( "StreetSuffix.txt" ); SECONDARY_INDICATOR_MAP = loadStandardAddressPartMap( "SecondaryUnitDesignators.txt" ); DIRECTIONALS = loadStandardAddressPartMap( "Directionals.txt" ); } catch (Exception x ) { x.printStackTrace(); } } private static Map loadStandardAddressPartMap( String a_resourceName ) throws IOException { Map map = new HashMap(); InputStreamReader isr = new InputStreamReader( StreetAddressStandardizer.class.getResourceAsStream( a_resourceName ) ); BufferedReader data = new BufferedReader( isr ); String line = null; while( (line = data.readLine()) != null ) { String[] components = line.split( ",", 4 ); if( components.length >= 3 ) { StandardAddressPart sap = new StandardAddressPart( components[0], components[2], components.length == 4 ? components[3] : null ); map.put( components[0], sap ); map.put( components[2], sap ); String[] abbreviations = components[1].split( " " ); for( int idx = 0; idx < abbreviations.length; idx++ ) { map.put( abbreviations[idx], sap ); } } else { System.out.println( "Bad Input: " + line ); } } return Collections.synchronizedMap( map ); } private static int getLastSuffixIndexPos(String[] a_parsedAddrElements) { for( int idx = a_parsedAddrElements.length; idx > 0; idx--) { if( SUFFIX_ABBR_MAP.get( a_parsedAddrElements[ idx - 1 ] ) != null ) { return idx - 1; } } return -1; } private static boolean isAlphaNumeric(String a_element) { return s_alphaNumeric.matcher( a_element ).matches(); } public StreetAddressStandardizer( String a_deliveryAddressLine ) { m_rawDeliveryAddressLine = a_deliveryAddressLine; try { parseDeliveryAddressLine( a_deliveryAddressLine ); m_standardDeliveryAddressLine = buildDeliveryAddressLine(); } catch (Exception x ) { x.printStackTrace(); m_error = x.getMessage(); } } /** * This method takes a USPS Delivery Address Line and parses the string into * street number, street name, pre-directional, post-directional, street * suffix, secondary address indicator, secondary address range. * * @param a_deliveryAddressLine * The USPS Delivery Address Line to parse and Standardize. Value * cannot be null. An empty String value just returns in essence * doing nothing. * @throws StandardizationException * Thrown when the address line is not in a valid format and * cannot be parsed. */ private void parseDeliveryAddressLine(String a_deliveryAddressLine) throws StandardizationException { // // … perform parameter validation and preprocessing steps // // // first make uppercase and parse the string based on white space // String[] parsedAddrElements = a_deliveryAddressLine.toUpperCase() .split("\\s|\\."); /* if( parsedAddrElements.length == 1 && parsedAddrElements[0].equals( "" )) { String[] empty = { }; parsedAddrElements = empty; }*/ // // finite state machine implementation of parsing Delivery Address Line // and setting StandardizedAddress attributes. // // Address line is processed from left to right // The beginning state is 0. // state 0: // Street Number ---> State 1 // state 1: // Pre-directional ---> State 2 // Street Name ---> State 3 // state 2: // Street Name ---> State 3 // state 3: // Street Name ---> State 3 // Suffix ---> State 4 // state 4: // Post-directional ---> State 5 // '#' ---> State 6 // 2nd Address Ind ---> State 6 // ---> State 7 // state 5: // '#' ---> State 6 // 2nd Address Ind ---> State 6 // ---> State 7 // state 6: // 2nd Address Ranges ---> State 7 // state 7: // ---> // int state = STREET_NUMBER; boolean continueParse = true; // // Determine the index position of the Last occurrance of a Street // suffix. // // System.out.println( parsedAddrElements.length ); int lastSuffix = getLastSuffixIndexPos(parsedAddrElements); for (int index = 0; index < parsedAddrElements.length; index++) { // System.out.println( index + ":" + s_stageNames[state] + ":" + parsedAddrElements[index] + ":["+getStreetNumber()+"]["+getPreDirectional()+"]["+getStreetName()+"]["+getStreetSuffix()+"]["+getPostDirectional()+"]["+getSecondaryAddressIndicator()+"]["+getSecondaryAddressRange()+"]"); if( parsedAddrElements[index].length() == 0 ) { continue; } switch (state) { case STREET_NUMBER: if (isAlphaNumeric(parsedAddrElements[index])) { setStreetNumber( parsedAddrElements[index] ); state = PRE_DIRECTIONAL; // // this default branch assumes an error if the first // token is not a numberic string. // } else { throw new StandardizationException("Invalid State " + state + " encountered processing token " + parsedAddrElements[index] + " in Delivery Address Line: " + a_deliveryAddressLine); } break; case PRE_DIRECTIONAL: if (DIRECTIONALS.get(parsedAddrElements[index]) != null) { setRawPreDirectional( parsedAddrElements[index] ); setPreDirectional( ((StandardAddressPart) DIRECTIONALS .get(parsedAddrElements[index])).getAbbreviation()); state = STREET_NAME; } else { appendStreetName( parsedAddrElements[index] ); state = STREET_SUFFIX; } break; case STREET_NAME: appendStreetName( parsedAddrElements[index] ); state = STREET_SUFFIX; break; case STREET_SUFFIX: // // If the element is a Suffix // abbreviation and it is the last Suffix parse the // street suffix, otherwise add to the street name. // if ((SUFFIX_ABBR_MAP.get(parsedAddrElements[index]) != null) && (lastSuffix == index)) { setStreetSuffix( ((StandardAddressPart) SUFFIX_ABBR_MAP .get(parsedAddrElements[index])).getAbbreviation()); state = POST_DIRECTIONAL; } else if (DIRECTIONALS.get(parsedAddrElements[index]) != null) { // re-process as a directional index--; state = POST_DIRECTIONAL; } else { appendStreetName( parsedAddrElements[index] ); state = STREET_SUFFIX; } break; case POST_DIRECTIONAL: if (SECONDARY_INDICATOR_MAP.get(parsedAddrElements[index]) != null) { StandardAddressPart p = (StandardAddressPart)SECONDARY_INDICATOR_MAP.get(parsedAddrElements[index]); setSecondaryAddressIndicator( p.getAbbreviation() ); if( "**".equals( p.getRule() )) { // no range state = END_OF_TOKENS; } else { // regular if( index < parsedAddrElements.length - 1 && SECONDARY_INDICATOR_MAP.get( parsedAddrElements[index + 1]) != null ) { // one indicator followed by another state = SECONDARY; } else { state = SECONDARY_RANGE; } } } else if (DIRECTIONALS.get(parsedAddrElements[index]) != null) { setPostDirectional( ((StandardAddressPart) DIRECTIONALS .get(parsedAddrElements[index])).getAbbreviation()); state = SECONDARY; if( getRawPreDirectional() != null ) { prependStreetName( getRawPreDirectional() ); setRawPreDirectional( null ); setPreDirectional( null ); } } /* else if ("#".equals(parsedAddrElements[index])) { setSecondaryAddressIndicator( parsedAddrElements[index] ); state = SECONDARY_RANGE; // // If a street suffix is encountered here then a street // suffix // was apart of the street name. Append the suffix to the // name // and set the suffix to the current token. // } */ else if (SUFFIX_ABBR_MAP.get(parsedAddrElements[index]) != null) { this.m_streetName += " " + parsedAddrElements[index - 1]; setStreetSuffix( ((StandardAddressPart) SUFFIX_ABBR_MAP .get(parsedAddrElements[index])).getAbbreviation()); state = POST_DIRECTIONAL; } else if (isAlphaNumeric(parsedAddrElements[index]) && (index == parsedAddrElements.length - 1)) { setSecondaryAddressIndicator( "#" ); setSecondaryAddressRange( parsedAddrElements[index] ); state = END_OF_TOKENS; } else { throw new StandardizationException("Invalid State " + state + " encountered processing token " + parsedAddrElements[index] + " in Delivery Address Line: " + a_deliveryAddressLine); } break; case SECONDARY: if (SECONDARY_INDICATOR_MAP.get(parsedAddrElements[index]) != null) { StandardAddressPart p = (StandardAddressPart)SECONDARY_INDICATOR_MAP.get(parsedAddrElements[index]); String tempIndicator = getSecondaryAddressIndicator(); if( tempIndicator != null && ! tempIndicator.equals( "" )) { // concatenate indicators - i.e. PO, BOX, PO BOX setSecondaryAddressIndicator( tempIndicator + " " + p.getAbbreviation() ); } else { setSecondaryAddressIndicator( p.getAbbreviation() ); } if( "**".equals( p.getRule() )) { // no range state = END_OF_TOKENS; } else { // regular if( index < parsedAddrElements.length - 1 && SECONDARY_INDICATOR_MAP.get( parsedAddrElements[index + 1]) != null ) { // one indicator followed by another state = SECONDARY; } else { state = SECONDARY_RANGE; } } } /* else if ("#".equals(parsedAddrElements[index])) { setSecondaryAddressIndicator( parsedAddrElements[index] ); state = SECONDARY_RANGE; // // throw exception if none of the cases above match // } */ else if (isAlphaNumeric(parsedAddrElements[index]) && (index == parsedAddrElements.length - 1)) { setSecondaryAddressIndicator( "#" ); setSecondaryAddressRange( parsedAddrElements[index] ); state = END_OF_TOKENS; } else { throw new StandardizationException("Invalid State " + state + " encountered processing token " + parsedAddrElements[index] + " in Delivery Address Line: " + a_deliveryAddressLine); } break; case SECONDARY_RANGE: setSecondaryAddressRange( parsedAddrElements[index] ); state = END_OF_TOKENS; break; case END_OF_TOKENS: // // This State 7 should really never be reached as the // loop should run out of tokens at the previous State. // state = STOP; continueParse = false; break; default: throw new StandardizationException("Invalid State encountered " + "parsing Delivery Address Line: " + a_deliveryAddressLine); } if (!continueParse) { break; } } } private class StandardizationException extends Exception { public StandardizationException(String aMessage) { super(aMessage); } } public String getSecondaryAddressIndicator() { return m_secondaryAddress; } public String getSecondaryAddressRange() { return m_secondaryAddressRange; } //public String getSecondaryAddrIndicator() { // return m_secondaryAddrIndicator; // } public String getStreetName() { return m_streetName; } public String getStreetNumber() { return m_streetNumber; } public String getPostDirectional() { return m_streetPostDirectional; } public String getPreDirectional() { return m_streetPreDirectional; } public String getStreetSuffix() { return m_streetSuffix; } private static class StandardAddressPart { private String m_rule; private String m_abbr; private String m_name; StandardAddressPart( String a_name, String a_abbr, String a_rule ) { m_rule = a_rule; m_abbr = a_abbr; m_name = a_name; } public String getAbbreviation() { return m_abbr; } public String getProperName() { return m_name; } public String getRule() { return m_rule; } } public String buildDeliveryAddressLine() { StringBuffer buffer = new StringBuffer(); if( getStreetNumber() != null ) { buffer.append( getStreetNumber() ); } if( getPreDirectional() != null ) { buffer.append( " " ).append( getPreDirectional() ); } if( getStreetName() != null ) { buffer.append( " " ).append( getStreetName() ); } if( getStreetSuffix() != null ) { buffer.append( " " ).append( getStreetSuffix() ); } if( getPostDirectional() != null ) { buffer.append( " " ).append( getPostDirectional() ); } if( getSecondaryAddressIndicator() != null ) { buffer.append( " " ).append( getSecondaryAddressIndicator() ); if( getSecondaryAddressRange() != null ) { buffer.append( " " ).append( getSecondaryAddressRange() ); } } return buffer.toString().trim(); } void setSecondaryAddressIndicator(String a_secondaryAddress) { m_secondaryAddress = a_secondaryAddress; } void setSecondaryAddressRange(String a_secondaryAddressRange) { m_secondaryAddressRange = a_secondaryAddressRange; } void setStreetNumber(String a_streetNumber) { m_streetNumber = a_streetNumber; } void setPostDirectional(String a_streetPostDirectional) { m_streetPostDirectional = a_streetPostDirectional; } void setPreDirectional(String a_streetPreDirectional) { m_streetPreDirectional = a_streetPreDirectional; } void setStreetSuffix(String a_streetSuffix) { m_streetSuffix = a_streetSuffix; } public String getRawPreDirectional() { return m_rawPreDirectional; } void setRawPreDirectional(String a_rawPreDirectional) { m_rawPreDirectional = a_rawPreDirectional; } public void appendStreetName( String a_name ) { if( this.m_streetName != null ) { this.m_streetName = this.m_streetName + " " + a_name; } else { this.m_streetName = a_name; } } public void prependStreetName( String a_name ) { if( this.m_streetName != null ) { this.m_streetName = a_name + " " + this.m_streetName; } else { this.m_streetName = a_name; } } public static void main( String[] a_args ) throws Exception { String testAddress = null; //testAddress = "16 Southeast Old Farm Road West 26"; // testAddress = "16 S Old Farm Road Suite 15"; //testAddress = "60 Sunset Ave."; // testAddress = "27 Southeast Freeway North"; // testAddress = "29 East Avenue WEst"; //testAddress = "4513 3RD STREET CIRCLE WEST"; //testAddress = "1919 East Main Avenue Drive West"; // testAddress = ""; testAddress = "610 HIGHWAY O, PO BOX 1068"; testAddress = "814 SPRINGER AVE, PO BOX 266"; testAddress = "PO BOX 307, 400 HARKINS LN"; testAddress = "2100 Stantonburg Road PO Box 6028"; testAddress = "610 HIGHWAY O, PO BOX 1068"; testAddress = "PO BOX 670"; testAddress = "27 Southeast Freeway North"; StreetAddressStandardizer sas = new StreetAddressStandardizer( testAddress ); // sas.parseDeliveryAddressLine( testAddress ); System.out.println( "Street Number : " + sas.getStreetNumber()); System.out.println( "Pre Directional : " + sas.getPreDirectional()); System.out.println( "Street Name : " + sas.getStreetName()); System.out.println( "Street Suffix : " + sas.getStreetSuffix()); System.out.println( "Post Directional : " + sas.getPostDirectional()); System.out.println( "Secondary Addr Ind : " + sas.getSecondaryAddressIndicator()); System.out.println( "Secondary Addr Range: " + sas.getSecondaryAddressRange()); System.out.println( "BEFORE: " + testAddress ); System.out.println( "AFTER: " + sas.getStandardDeliveryAddressLine() ); // System.out.println( "AFTER: " + sas.getStandardDeliveryAddressLine() != null ? sas.getStandardDeliveryAddressLine() : sas.getError() ); } public String getStandardDeliveryAddressLine() { return m_standardDeliveryAddressLine; } public String getRawDeliveryAddressLine() { return m_rawDeliveryAddressLine; } public String getError() { return m_error; } }