/*
 * Copyright (c) 2014, salesforce.com, inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided
 * that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the
 * following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
 * the following disclaimer in the documentation and/or other materials provided with the distribution.
 *
 * Neither the name of salesforce.com, inc. nor the names of its contributors may be used to endorse or
 * promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package com.sforce.dataset.loader.file.schema;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;

import org.apache.commons.io.input.BOMInputStream;
import org.supercsv.io.CsvListReader;
import org.supercsv.prefs.CsvPreference;

import com.sforce.dataset.util.DatasetUtils;

/**
 * Infers a field type (Numeric, Date or Text) for every column of a CSV file by
 * sampling up to {@link #sampleSize} non-empty values per column.
 *
 * <p>NOTE(review): this file was reconstructed from a lossy deletion diff in which
 * every span between a {@code <} and the next {@code >} had been stripped (all
 * generic type parameters and a few loop guards). The reconstructed spans are
 * marked with NOTE(review) comments below — confirm them against VCS history.
 *
 * <p>Not thread-safe: no shared state, but {@link SimpleDateFormat} instances are
 * created per call and must not be shared across threads.
 */
public class DetectFieldTypes {

	/** Maximum number of non-empty values sampled per column. */
	public static final int sampleSize = 3000;

	/**
	 * Explicit date/time patterns tried (non-leniently) in addition to the
	 * locale-derived formats; ISO-8601 variants first, then common US/EU orders.
	 */
	public static final String[] additionalDatePatterns = {
			"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd'T'HH:mm:ss.SSS", "yyyy-MM-dd'T'HH:mm:ss",
			"MM/dd/yyyy HH:mm:ss", "MM/dd/yy HH:mm:ss", "MM-dd-yyyy HH:mm:ss", "MM-dd-yy HH:mm:ss",
			"dd/MM/yyyy HH:mm:ss", "dd/MM/yy HH:mm:ss", "dd-MM-yyyy HH:mm:ss", "dd-MM-yy HH:mm:ss",
			"MM/dd/yyyy", "MM/dd/yy", "dd/MM/yy", "dd/MM/yyyy", "MM-dd-yyyy", "MM-dd-yy", "dd-MM-yyyy", "dd-MM-yy",
			"M/d/yyyy HH:mm:ss", "M/d/yy HH:mm:ss", "M-d-yyyy HH:mm:ss", "M-d-yy HH:mm:ss",
			"d/M/yyyy HH:mm:ss", "d/M/yy HH:mm:ss", "d-M-yyyy HH:mm:ss", "d-M-yy HH:mm:ss",
			"M/d/yy", "M/d/yyyy", "d/M/yy", "d/M/yyyy", "M-d-yy", "M-d-yyyy", "d-M-yy", "d-M-yyyy",
			"M/dd/yyyy HH:mm:ss", "M/dd/yy HH:mm:ss", "M-dd-yyyy HH:mm:ss", "M-dd-yy HH:mm:ss",
			"dd/M/yyyy HH:mm:ss", "dd/M/yy HH:mm:ss", "dd-M-yyyy HH:mm:ss", "dd-M-yy HH:mm:ss",
			"M/dd/yy", "dd/M/yy", "M-dd-yy", "dd-M-yy", "M/dd/yyyy", "dd/M/yyyy", "M-dd-yyyy", "dd-M-yyyy",
			"MM/d/yyyy HH:mm:ss", "MM/d/yy HH:mm:ss", "MM-d-yyyy HH:mm:ss", "MM-d-yy HH:mm:ss",
			"d/MM/yyyy HH:mm:ss", "d/MM/yy HH:mm:ss", "d-MM-yyyy HH:mm:ss", "d-MM-yy HH:mm:ss",
			"MM/d/yy", "d/MM/yy", "MM-d-yy", "d-MM-yy", "MM/d/yyyy", "d/MM/yyyy", "MM-d-yyyy", "d-MM-yyyy"};

	/**
	 * Detects a {@link FieldType} for every column of the given CSV file.
	 * Columns already defined in {@code userSchema} are reused verbatim; the rest
	 * are classified by sampling values (Numeric, then Date, falling back to Text).
	 *
	 * @param inputCsv    CSV file to scan; first row must be the header
	 * @param userSchema  optional user-supplied schema whose fields override detection (may be null)
	 * @param fileCharset charset of the file (null lets DatasetUtils pick a default UTF-8 decoder)
	 * @param logger      progress/diagnostic output stream
	 * @return one FieldType per header column, in header order
	 * @throws IOException if the file cannot be opened or read
	 */
	public LinkedList<FieldType> detect(File inputCsv, ExternalFileSchema userSchema, Charset fileCharset, PrintStream logger) throws IOException
	{
		CsvListReader reader = null;
		LinkedList<FieldType> types = null;
		String[] header = null;
		try
		{
			// Read just the header row, honoring any BOM and the supplied charset.
			reader = new CsvListReader(new InputStreamReader(new BOMInputStream(new FileInputStream(inputCsv), false), DatasetUtils.utf8Decoder(null, fileCharset)), CsvPreference.STANDARD_PREFERENCE);
			header = reader.getHeader(true);
			reader.close(); // original guarded with if(reader!=null), but reader was already dereferenced above
			reader = null;

			List<String> nextLine = null;
			types = new LinkedList<FieldType>();
			boolean uniqueColumnFound = false;
			String[] devNames = ExternalFileSchema.createUniqueDevName(header);
			boolean first = true;
			for (int i = 0; i < header.length; i++)
			{
				// A leading '#' on the first header cell is a comment marker, not part of the name.
				if (i == 0 && header[i] != null && header[i].startsWith("#"))
					header[i] = header[i].replace("#", "");

				// If the user-supplied schema already defines this column (matched by dev name),
				// reuse that field and skip detection.
				boolean found = false;
				if (userSchema != null && userSchema.objects != null && !userSchema.objects.isEmpty())
				{
					List<FieldType> fields = userSchema.objects.get(0).fields;
					if (fields != null && !fields.isEmpty())
					{
						for (FieldType field : fields)
						{
							if (field.getName().equals(devNames[i]))
							{
								types.add(field);
								found = true;
								break;
							}
						}
					}
				}
				if (found)
					continue;

				if (first)
				{
					logger.println("Detecting schema from csv file {" + inputCsv + "} ...");
					first = false;
				}

				// Sample this column by re-reading the file from the top.
				// NOTE(review): O(columns * rows) overall; tolerable because of the 10000-row
				// cap below, but a single-pass sampler would be cheaper for wide files.
				LinkedList<String> columnValues = new LinkedList<String>();
				HashSet<String> uniqueColumnValues = new HashSet<String>();
				int rowCount = 0;
				logger.print("Column: " + header[i]);
				try
				{
					reader = new CsvListReader(new InputStreamReader(new BOMInputStream(new FileInputStream(inputCsv), false), DatasetUtils.utf8Decoder(null, fileCharset)), CsvPreference.STANDARD_PREFERENCE);
					header = reader.getHeader(true);

					rowCount++; // count the header row so rowCount-1 == data rows seen
					while ((nextLine = reader.read()) != null)
					{
						rowCount++;
						if (i >= nextLine.size())
							continue; // this line does not have enough columns
						String value = nextLine.get(i);
						if (value != null && !value.trim().isEmpty())
						{
							columnValues.add(value.trim());
							uniqueColumnValues.add(value.trim());
						}
						// Stop once we have enough samples or have scanned enough rows.
						if (columnValues.size() >= sampleSize || rowCount > 10000)
							break;
					}
				} catch (Throwable t)
				{
					// Best effort: a malformed row must not abort detection of the whole file.
					t.printStackTrace();
				} finally
				{
					if (reader != null)
						reader.close();
					reader = null;
				}
				logger.print(", ");

				FieldType newField = null;
				int prec = detectTextPrecision(uniqueColumnValues);
				BigDecimal bd = detectNumeric(columnValues);
				// An all-distinct integer column is almost certainly a numeric unique ID:
				// treat it as Text/Dimension rather than a measure.
				if (bd != null && (uniqueColumnValues.size() == (rowCount - 1)) && bd.scale() == 0)
					bd = null;

				if (bd != null)
				{
					newField = FieldType.GetMeasureKeyDataType(devNames[i], 0, bd.scale(), 0L);
					logger.println("Type: Numeric, Scale: " + bd.scale());
				} else
				{
					SimpleDateFormat sdf = detectDate(columnValues);
					if (sdf != null)
					{
						newField = FieldType.GetDateKeyDataType(devNames[i], sdf.toPattern(), null);
						logger.println("Type: Date, Format: " + sdf.toPattern());
					} else
					{
						newField = FieldType.GetStringKeyDataType(devNames[i], null, null);
						// The first short (prec < 32), all-distinct text column becomes the uniqueId.
						if (!uniqueColumnFound && uniqueColumnValues.size() == (rowCount - 1) && prec < 32)
						{
							newField.isUniqueId = true;
							uniqueColumnFound = true;
						}
						if (prec > 255)
						{
							logger.println("Type: Text, Precision: " + 255 + " (Column will be truncated to 255 characters)" + (newField.isUniqueId ? ", isUniqueId=true" : ""));
						} else
						{
							logger.println("Type: Text, Precision: " + prec + (newField.isUniqueId ? ", isUniqueId=true" : ""));
						}
						newField.setPrecision(255); // assume the upper limit even if sampled values are smaller
					}
				}
				if (newField != null)
				{
					if (header[i] != null && !header[i].trim().isEmpty())
					{
						newField.setLabel(header[i]);
						newField.setDescription(header[i]);
					}
					types.add(newField);
				}
			} // end for each column

			if (!first)
			{
				logger.println("Schema file {" + ExternalFileSchema.getSchemaFile(inputCsv, logger) + "} successfully generated...");
			}
		} finally
		{
			logger.println("");
			if (reader != null)
				reader.close();
		}
		return types;
	}

	/**
	 * Decides whether a column is numeric and, if so, which scale to use.
	 *
	 * @param columnValues sampled, trimmed, non-empty values for one column
	 * @return the sampled value carrying the chosen (possibly clamped) maximum scale,
	 *         or null when fewer than 95% of the samples parse as numbers (or the
	 *         sample is empty)
	 */
	public BigDecimal detectNumeric(LinkedList<String> columnValues)
	{
		BigDecimal maxScale = null;     // sample with the largest scale seen
		BigDecimal maxPrecision = null; // sample with the largest precision seen
		int failures = 0;               // tracked for symmetry with detectDate; not used in the verdict
		int success = 0;
		int absoluteMaxScale = 9;       // platform limit on measure scale
		int absoluteMaxPrecision = 18;  // platform limit on measure precision

		for (int j = 0; j < columnValues.size(); j++)
		{
			String columnValue = columnValues.get(j);
			// NOTE(review): this guard was reconstructed from a lossy diff — the visible
			// residue was "if(...>absoluteMaxPrecision) continue;". Interpreted as: values
			// longer than the max precision cannot be valid measures. Confirm against VCS.
			if (columnValue.length() > absoluteMaxPrecision)
				continue;

			try
			{
				BigDecimal bd = new BigDecimal(columnValue);
				if (bd.precision() > absoluteMaxPrecision || bd.scale() > absoluteMaxPrecision)
					continue; // parseable, but outside platform limits — neither success nor failure
				if (maxScale == null || bd.scale() > maxScale.scale())
					maxScale = bd;
				if (maxPrecision == null || bd.precision() > maxPrecision.precision())
					maxPrecision = bd;
				success++;
			} catch (Throwable t)
			{
				failures++;
			}
		}

		// The scale budget is whatever precision remains after the widest integer part.
		if (maxPrecision != null)
			absoluteMaxScale = absoluteMaxPrecision - (maxPrecision.precision() - maxPrecision.scale());

		if (absoluteMaxScale > 9)
			absoluteMaxScale = 9;
		else if (absoluteMaxScale <= 2)
			absoluteMaxScale = 2;

		if (maxScale != null && maxScale.scale() > absoluteMaxScale)
			maxScale = maxScale.setScale(absoluteMaxScale, RoundingMode.HALF_EVEN);

		// Numeric only if at least 95% of the sample parsed.
		// (Empty sample: 0.0/0 is NaN, the comparison is false, and null is returned.)
		if ((1.0 * success / columnValues.size()) > 0.95)
			return maxScale;
		return null;
	}

	/**
	 * Tries to find a date format that parses the column.
	 *
	 * <p>For each sampled value, every candidate format is tried; a candidate counts
	 * as a hit only when formatting the parsed date reproduces a string of the same
	 * length (guards against lenient prefix parses). A hit is then validated against
	 * the whole sample and accepted only when more than 95% of values parse.
	 *
	 * @param columnValues sampled, trimmed, non-empty values for one column
	 * @return the matching format, or null if the column is not a date column
	 */
	public SimpleDateFormat detectDate(LinkedList<String> columnValues)
	{
		LinkedHashSet<SimpleDateFormat> dateFormats = getSuportedDateFormats();

		for (int j = 0; j < columnValues.size(); j++)
		{
			String columnValue = columnValues.get(j);
			SimpleDateFormat dtf = null;
			Date dt = null;
			// NOTE(review): this guard was reconstructed from a lossy diff — the visible
			// residue was "if(...30) continue;". Interpreted as skipping values whose
			// length is outside a plausible date range. Confirm against VCS history.
			if (columnValue.length() < 6 || columnValue.length() > 30)
				continue;

			for (SimpleDateFormat sdf : dateFormats)
			{
				try
				{
					dt = sdf.parse(columnValue);
					// Round-trip check: a genuine match formats back to the same length.
					String tmpDate = sdf.format(dt);
					if (tmpDate.length() == columnValue.length())
					{
						dtf = sdf;
						break;
					} else
						dt = null;
				} catch (Throwable t)
				{
					dtf = null;
					dt = null;
				}
			}

			if (dt != null)
			{
				int failures = 0; // kept for diagnostics; only success drives the verdict
				int success = 0;
				for (int k = 0; k < columnValues.size(); k++)
				{
					try
					{
						dt = dtf.parse(columnValues.get(k));
						success++;
					} catch (Throwable t)
					{
						failures++;
					}
				}
				if ((1.0 * success / columnValues.size()) > 0.95)
					return dtf;
				// Format matched one value but not the column — don't try it again.
				dateFormats.remove(dtf);
				dtf = null;
				dt = null;
			}
		}
		return null;
	}

	/**
	 * Returns the length of the longest value in the set (0 for an empty set) —
	 * used as the text precision of a column.
	 *
	 * @param uniqueColumnValues distinct sampled values for one column
	 */
	public int detectTextPrecision(HashSet<String> uniqueColumnValues)
	{
		int length = 0;
		for (String columnValue : uniqueColumnValues)
		{
			if (columnValue != null && columnValue.length() > length)
				length = columnValue.length();
		}
		return length;
	}

	/**
	 * Builds the ordered candidate set of date formats: locale date-time formats
	 * first, then the explicit {@link #additionalDatePatterns}, then locale
	 * date-only formats. All candidates are non-lenient so near-misses fail to parse.
	 *
	 * <p>(The typo in the method name is preserved for API compatibility.)
	 *
	 * @return insertion-ordered, de-duplicated set of candidate formats
	 */
	public LinkedHashSet<SimpleDateFormat> getSuportedDateFormats()
	{
		LinkedHashSet<SimpleDateFormat> dateFormats = new LinkedHashSet<SimpleDateFormat>();
		Locale[] locales = Locale.getAvailableLocales();
		for (int i = 0; i < locales.length; i++)
		{
			if (locales[i].getCountry().length() == 0)
			{
				continue; // skip language-only locales
			}
			SimpleDateFormat sdf = (SimpleDateFormat) SimpleDateFormat.getDateTimeInstance(SimpleDateFormat.MEDIUM, SimpleDateFormat.MEDIUM, locales[i]);
			SimpleDateFormat tempSdf = new SimpleDateFormat(sdf.toPattern());
			tempSdf.setLenient(false);
			dateFormats.add(tempSdf);
		}

		for (int i = 0; i < additionalDatePatterns.length; i++)
		{
			try
			{
				SimpleDateFormat tempSdf = new SimpleDateFormat(additionalDatePatterns[i]);
				tempSdf.setLenient(false);
				dateFormats.add(tempSdf);
			} catch (Throwable t1)
			{
				// An invalid pattern should not abort building the candidate set.
				t1.printStackTrace();
			}
		}

		for (int i = 0; i < locales.length; i++)
		{
			if (locales[i].getCountry().length() == 0)
			{
				continue; // skip language-only locales
			}
			SimpleDateFormat sdf = (SimpleDateFormat) SimpleDateFormat.getDateInstance(SimpleDateFormat.MEDIUM, locales[i]);
			SimpleDateFormat tempSdf = new SimpleDateFormat(sdf.toPattern());
			tempSdf.setLenient(false);
			dateFormats.add(tempSdf);
		}
		return dateFormats;
	}
}