From 564203397f52b3779e7f7edc62dda7359de620cb Mon Sep 17 00:00:00 2001 From: Puneet Gupta Date: Tue, 25 Nov 2014 09:56:34 -0800 Subject: [PATCH] fixed an edge condition in data detection --- .../loader/file/schema/DetectFieldTypes.java | 133 ++++++++---------- 1 file changed, 58 insertions(+), 75 deletions(-) diff --git a/src/main/java/com/sforce/dataset/loader/file/schema/DetectFieldTypes.java b/src/main/java/com/sforce/dataset/loader/file/schema/DetectFieldTypes.java index c90e904..0832b60 100644 --- a/src/main/java/com/sforce/dataset/loader/file/schema/DetectFieldTypes.java +++ b/src/main/java/com/sforce/dataset/loader/file/schema/DetectFieldTypes.java @@ -35,6 +35,7 @@ import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; +import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Locale; @@ -233,17 +234,15 @@ public BigDecimal detectNumeric(LinkedList columnValues) return null; } } - - - public SimpleDateFormat detectDate(LinkedList columnValues) { - - Locale[] locales = Locale.getAvailableLocales(); + public SimpleDateFormat detectDate(LinkedList columnValues) + { @SuppressWarnings("unused") int failures = 0; int success = 0; + + LinkedHashSet dateFormats = getSuportedDateFormats(); - //Date dt = new Date(System.currentTimeMillis()); for(int j=0;j columnValues) { else columnValue = columnValue.trim(); - for (int i = 0; i < locales.length; i++) + for (SimpleDateFormat sdf:dateFormats) { - if (locales[i].getCountry().length() == 0) { - continue; // Skip language-only locales - } - - //System.out.print(i + "| " + locales[i].getDisplayCountry()+ "| "); - SimpleDateFormat sdf = (SimpleDateFormat) SimpleDateFormat.getDateTimeInstance(SimpleDateFormat.MEDIUM,SimpleDateFormat.MEDIUM, locales[i]); - dtf = new SimpleDateFormat(sdf.toPattern()); - dtf.setLenient(false); try { - dt = dtf.parse(columnValue.trim()); - String tmpDate = dtf.format(dt); - if(tmpDate.equalsIgnoreCase(columnValue)) + dt = sdf.parse(columnValue); + String tmpDate = sdf.format(dt); + if(tmpDate.length() == columnValue.length()) { - //System.out.println("Format:{"+dtf.toPattern()+"} value:{"+columnValue+"}"); + dtf = sdf; break; }else dt=null; }catch(Throwable t) { +// if(dtf.toPattern().equals("MM/dd/yyyy hh:mm:ss a")) +// { +// System.out.println(i + "| " + locales[i].getDisplayCountry()+"| "+dtf.toPattern()); +// System.out.println(columnValue.trim()); +// t.printStackTrace(); +// } } - }//end for-i - - if(dt==null) - { - for (int i = 0; i < additionalDatePatterns.length; i++) - { - try - { - dtf = new SimpleDateFormat(additionalDatePatterns[i]); - dtf.setLenient(false); - dt = dtf.parse(columnValue.trim()); - String tmpDate = dtf.format(dt); - if(tmpDate.equalsIgnoreCase(columnValue)) - { - //System.out.println("Format:{"+dtf.toPattern()+"} value:{"+columnValue+"}"); - break; - }else - dt=null; - }catch(Throwable t1) - { - - } - } } - if(dt==null) - { - for (int i = 0; i < locales.length; i++) - { - if (locales[i].getCountry().length() == 0) { - continue; // Skip language-only locales - } - - //System.out.print(i + "| " + locales[i].getDisplayCountry()+ "| "); - SimpleDateFormat sdf = (SimpleDateFormat) SimpleDateFormat.getDateInstance(SimpleDateFormat.MEDIUM, locales[i]); - dtf = new SimpleDateFormat(sdf.toPattern()); - dtf.setLenient(false); - //System.out.println(dtf.toPattern()); - try - { - dt = dtf.parse(columnValue.trim()); - String tmpDate = dtf.format(dt); - if(tmpDate.equalsIgnoreCase(columnValue)) - { - //System.out.println("Format:{"+dtf.toPattern()+"} value:{"+columnValue+"}"); - break; - }else - dt=null; - }catch(Throwable t) - { - } - } - } - - - if(dt!=null) { for(int k=0;k columnValues) { columnValue = columnValue.trim(); try { - Date dt1 = dtf.parse(columnValue.trim()); + Date dt1 = dtf.parse(columnValue); String tmpDate = dtf.format(dt1); - if(tmpDate.equalsIgnoreCase(columnValue)) + if(tmpDate.length() == columnValue.length()) { success++; } @@ -357,6 +300,9 @@ public SimpleDateFormat detectDate(LinkedList columnValues) { if((1.0*success/columnValues.size()) > 0.85) { return dtf; + }else + { + dateFormats.remove(dtf); //lets not try this format again } } } @@ -377,6 +323,43 @@ public int detectTextPrecision(LinkedList columnValues) { } return length; } + + public LinkedHashSet getSuportedDateFormats() + { + LinkedHashSet dateFormats = new LinkedHashSet(); + Locale[] locales = Locale.getAvailableLocales(); + for (int i = 0; i < locales.length; i++) + { + if (locales[i].getCountry().length() == 0) { + continue; // Skip language-only locales + } + SimpleDateFormat sdf = (SimpleDateFormat) SimpleDateFormat.getDateTimeInstance(SimpleDateFormat.MEDIUM,SimpleDateFormat.MEDIUM, locales[i]); + dateFormats.add(new SimpleDateFormat(sdf.toPattern())); + } + + for (int i = 0; i < additionalDatePatterns.length; i++) + { + try + { + SimpleDateFormat sdf = new SimpleDateFormat(additionalDatePatterns[i]); + dateFormats.add(new SimpleDateFormat(sdf.toPattern())); + }catch(Throwable t1) + { + t1.printStackTrace(); + } + } + + for (int i = 0; i < locales.length; i++) + { + if (locales[i].getCountry().length() == 0) { + continue; // Skip language-only locales + } + SimpleDateFormat sdf = (SimpleDateFormat) SimpleDateFormat.getDateInstance(SimpleDateFormat.MEDIUM, locales[i]); + dateFormats.add(new SimpleDateFormat(sdf.toPattern())); + } + return dateFormats; + } +