Skip to content

Commit

Permalink
fixed an edge condition in data detection
Browse files Browse the repository at this point in the history
  • Loading branch information
datasetutil committed Nov 25, 2014
1 parent cf42983 commit 5642033
Showing 1 changed file with 58 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
Expand Down Expand Up @@ -233,17 +234,15 @@ public BigDecimal detectNumeric(LinkedList<String> columnValues)
return null;
}
}



public SimpleDateFormat detectDate(LinkedList<String> columnValues) {

Locale[] locales = Locale.getAvailableLocales();
public SimpleDateFormat detectDate(LinkedList<String> columnValues)
{
@SuppressWarnings("unused")
int failures = 0;
int success = 0;

LinkedHashSet<SimpleDateFormat> dateFormats = getSuportedDateFormats();

//Date dt = new Date(System.currentTimeMillis());
for(int j=0;j<columnValues.size();j++)
{
String columnValue = columnValues.get(j);
Expand All @@ -254,85 +253,29 @@ public SimpleDateFormat detectDate(LinkedList<String> columnValues) {
else
columnValue = columnValue.trim();

for (int i = 0; i < locales.length; i++)
for (SimpleDateFormat sdf:dateFormats)
{
if (locales[i].getCountry().length() == 0) {
continue; // Skip language-only locales
}

//System.out.print(i + "| " + locales[i].getDisplayCountry()+ "| ");
SimpleDateFormat sdf = (SimpleDateFormat) SimpleDateFormat.getDateTimeInstance(SimpleDateFormat.MEDIUM,SimpleDateFormat.MEDIUM, locales[i]);
dtf = new SimpleDateFormat(sdf.toPattern());
dtf.setLenient(false);
try
{
dt = dtf.parse(columnValue.trim());
String tmpDate = dtf.format(dt);
if(tmpDate.equalsIgnoreCase(columnValue))
dt = sdf.parse(columnValue);
String tmpDate = sdf.format(dt);
if(tmpDate.length() == columnValue.length())
{
//System.out.println("Format:{"+dtf.toPattern()+"} value:{"+columnValue+"}");
dtf = sdf;
break;
}else
dt=null;
}catch(Throwable t)
{
// if(dtf.toPattern().equals("MM/dd/yyyy hh:mm:ss a"))
// {
// System.out.println(i + "| " + locales[i].getDisplayCountry()+"| "+dtf.toPattern());
// System.out.println(columnValue.trim());
// t.printStackTrace();
// }
}
}//end for-i

if(dt==null)
{
for (int i = 0; i < additionalDatePatterns.length; i++)
{
try
{
dtf = new SimpleDateFormat(additionalDatePatterns[i]);
dtf.setLenient(false);
dt = dtf.parse(columnValue.trim());
String tmpDate = dtf.format(dt);
if(tmpDate.equalsIgnoreCase(columnValue))
{
//System.out.println("Format:{"+dtf.toPattern()+"} value:{"+columnValue+"}");
break;
}else
dt=null;
}catch(Throwable t1)
{

}
}
}

if(dt==null)
{
for (int i = 0; i < locales.length; i++)
{
if (locales[i].getCountry().length() == 0) {
continue; // Skip language-only locales
}

//System.out.print(i + "| " + locales[i].getDisplayCountry()+ "| ");
SimpleDateFormat sdf = (SimpleDateFormat) SimpleDateFormat.getDateInstance(SimpleDateFormat.MEDIUM, locales[i]);
dtf = new SimpleDateFormat(sdf.toPattern());
dtf.setLenient(false);
//System.out.println(dtf.toPattern());
try
{
dt = dtf.parse(columnValue.trim());
String tmpDate = dtf.format(dt);
if(tmpDate.equalsIgnoreCase(columnValue))
{
//System.out.println("Format:{"+dtf.toPattern()+"} value:{"+columnValue+"}");
break;
}else
dt=null;
}catch(Throwable t)
{
}
}
}



if(dt!=null)
{
for(int k=0;k<columnValues.size();k++)
Expand All @@ -344,9 +287,9 @@ public SimpleDateFormat detectDate(LinkedList<String> columnValues) {
columnValue = columnValue.trim();

try {
Date dt1 = dtf.parse(columnValue.trim());
Date dt1 = dtf.parse(columnValue);
String tmpDate = dtf.format(dt1);
if(tmpDate.equalsIgnoreCase(columnValue))
if(tmpDate.length() == columnValue.length())
{
success++;
}
Expand All @@ -357,6 +300,9 @@ public SimpleDateFormat detectDate(LinkedList<String> columnValues) {
if((1.0*success/columnValues.size()) > 0.85)
{
return dtf;
}else
{
dateFormats.remove(dtf); //lets not try this format again
}
}
}
Expand All @@ -377,6 +323,43 @@ public int detectTextPrecision(LinkedList<String> columnValues) {
}
return length;
}

/**
 * Builds the ordered set of candidate formats tried during date detection.
 *
 * Candidates are added in this order, which is also the iteration order of
 * the returned set:
 * <ol>
 *   <li>the MEDIUM date + MEDIUM time pattern of every available locale
 *       that carries a country code,</li>
 *   <li>every entry of {@code additionalDatePatterns},</li>
 *   <li>the MEDIUM date-only pattern of every available locale that
 *       carries a country code.</li>
 * </ol>
 * Each candidate is re-created from its pattern string alone, so it uses the
 * default locale's symbols rather than those of the locale that supplied the
 * pattern; formats that end up equal are collapsed by the set.
 *
 * The returned formats keep the default (lenient) parsing mode; this method
 * never calls {@code setLenient}.
 *
 * @return a new, mutable, insertion-ordered set of candidate date formats
 */
public LinkedHashSet<SimpleDateFormat> getSuportedDateFormats()
{
    LinkedHashSet<SimpleDateFormat> dateFormats = new LinkedHashSet<SimpleDateFormat>();
    Locale[] locales = Locale.getAvailableLocales();

    // Pass 1: locale-specific date+time patterns.
    for (Locale locale : locales)
    {
        if (locale.getCountry().length() == 0) {
            continue; // Skip language-only locales
        }
        addPatternOf(dateFormats, SimpleDateFormat.getDateTimeInstance(
                SimpleDateFormat.MEDIUM, SimpleDateFormat.MEDIUM, locale));
    }

    // Pass 2: the explicitly configured extra patterns.
    for (String pattern : additionalDatePatterns)
    {
        try
        {
            dateFormats.add(new SimpleDateFormat(pattern));
        }
        catch (RuntimeException e)
        {
            // A malformed (or null) pattern must not prevent the remaining
            // candidates from being registered; report it and carry on.
            System.err.println("Skipping unusable date pattern: " + pattern);
            e.printStackTrace();
        }
    }

    // Pass 3: locale-specific date-only patterns.
    for (Locale locale : locales)
    {
        if (locale.getCountry().length() == 0) {
            continue; // Skip language-only locales
        }
        addPatternOf(dateFormats, SimpleDateFormat.getDateInstance(SimpleDateFormat.MEDIUM, locale));
    }
    return dateFormats;
}

/**
 * Adds a fresh SimpleDateFormat built from the pattern of {@code localized}.
 *
 * The JDK factory methods are only declared to return DateFormat, so the
 * instance is type-checked instead of blindly cast; a locale whose factory
 * result is not a SimpleDateFormat is skipped rather than aborting the
 * whole candidate list with a ClassCastException.
 */
private static void addPatternOf(LinkedHashSet<SimpleDateFormat> dateFormats, java.text.DateFormat localized)
{
    if (localized instanceof SimpleDateFormat)
    {
        String pattern = ((SimpleDateFormat) localized).toPattern();
        dateFormats.add(new SimpleDateFormat(pattern));
    }
}




Expand Down

0 comments on commit 5642033

Please sign in to comment.