diff --git a/pom.xml b/pom.xml
index efa8c991..2474b218 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
 com.github.heideltime
 heideltime
-2.2.1
+2.2.2-SNAPSHOT
 HeidelTime
 HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
@@ -24,6 +24,8 @@
 UTF-8
+true
+true
@@ -50,6 +52,8 @@
 src
 ${basedir}/class
+test
+${basedir}/testclass
 ${basedir}
@@ -70,8 +74,17 @@
 maven-compiler-plugin
 3.1
-1.7
-1.7
+1.8
+1.8
+
+org.apache.maven.plugins
+maven-dependency-plugin
+
+${project.build.directory}/lib
@@ -183,21 +196,21 @@
 org.apache.uima
 uimaj-core
-2.8.1
+2.10.2
 provided
 
 edu.stanford.nlp
 stanford-corenlp
-3.3.1
+3.8.0
 provided
 
 args4j
 args4j
-2.32
+2.33
 provided
@@ -206,5 +219,32 @@
 0.1
 provided
+
+org.slf4j
+slf4j-api
+1.7.25
+provided
+
+ch.qos.logback
+logback-core
+[1.2.3,)
+provided
+
+ch.qos.logback
+logback-classic
+[1.2.3,)
+provided
+
+junit
+junit
+[4.12,5)
+test
diff --git a/resources/english/normalization/resources_normalization_normDay.txt b/resources/english/normalization/resources_normalization_normDay.txt
index 1c161e02..fbaa332d 100644
--- a/resources/english/normalization/resources_normalization_normDay.txt
+++ b/resources/english/normalization/resources_normalization_normDay.txt
@@ -1,52 +1,42 @@
 // author: Jannik Strötgen
 // email: stroetgen@uni-hd.de
 // date: 2011-06-10
 // This file contains "day words" and their normalized expressions
 // according to TIMEX3 format.
 // For example, the normalized value of "first" is "01"
 // FORMAT: "day-word","normalized-day-word"
-"0","00"
-"00","00"
-"1","01"
-"01","01"
-"2","02"
-"02","02"
-"3","03"
-"03","03"
-"4","04"
-"04","04"
-"5","05"
-"05","05"
-"6","06"
-"06","06"
-"7","07"
-"07","07"
-"8","08"
-"08","08"
-"9","09"
-"09","09"
-"10","10"
-"11","11"
-"12","12"
-"13","13"
-"14","14"
-"15","15"
-"16","16"
-"17","17"
-"18","18"
-"19","19"
-"20","20"
-"21","21"
-"22","22"
-"23","23"
-"24","24"
-"25","25"
-"26","26"
-"27","27"
-"28","28"
-"29","29"
-"30","30"
-"31","31"
+"00?\.?","00"
+"0?1\.?","01"
+"0?2\.?","02"
+"0?3\.?","03"
+"0?4\.?","04"
+"0?5\.?","05"
+"0?6\.?","06"
+"0?7\.?","07"
+"0?8\.?","08"
+"0?9\.?","09"
+"10\.?","10"
+"11\.?","11"
+"12\.?","12"
+"13\.?","13"
+"14\.?","14"
+"15\.?","15"
+"16\.?","16"
+"17\.?","17"
+"18\.?","18"
+"19\.?","19"
+"20\.?","20"
+"21\.?","21"
+"22\.?","22"
+"23\.?","23"
+"24\.?","24"
+"25\.?","25"
+"26\.?","26"
+"27\.?","27"
+"28\.?","28"
+"29\.?","29"
+"30\.?","30"
+"31\.?","31"
 //
 "first","01"
 "second","02"
@@ -115,15 +105,15 @@
 "Thirtieth","30"
 "Thirty-first","31"
 //
-"1st","01"
-"2nd","02"
-"3rd","03"
-"4th","04"
-"5th","05"
-"6th","06"
-"7th","07"
-"8th","08"
-"9th","09"
+"0?1st","01"
+"0?2nd","02"
+"0?3rd","03"
+"0?4th","04"
+"0?5th","05"
+"0?6th","06"
+"0?7th","07"
+"0?8th","08"
+"0?9th","09"
 "10th","10"
 "11th","11"
 "12th","12"
diff --git a/resources/english/normalization/resources_normalization_normDayInWeek.txt b/resources/english/normalization/resources_normalization_normDayInWeek.txt
index d0385a18..0c5817ce 100644
--- a/resources/english/normalization/resources_normalization_normDayInWeek.txt
+++ b/resources/english/normalization/resources_normalization_normDayInWeek.txt
@@ -19,3 +19,5 @@
 "Friday","5"
 "Saturday","6"
 "Sunday","7"
+// Common spelling mistakes
+"[Ww]e[dn][dn]e?sday","3"
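Each line in the two normalization resources above pairs a regular expression (left column) with its normalized value (right column), so "1st", "01st", and even misspelled weekdays all reduce to one canonical form. A minimal sketch of that contract using plain java.util.regex — this is not HeidelTime's actual resource loader, and the class name and sample tokens are illustrative:

    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.regex.Pattern;

    // Illustrative only: HeidelTime's real loading and matching logic is more involved.
    public class NormResourceSketch {
        public static void main(String[] args) {
            Map<Pattern, String> normDay = new LinkedHashMap<>();
            normDay.put(Pattern.compile("0?1st"), "01");              // "1st" and "01st" -> "01"
            normDay.put(Pattern.compile("[Ww]e[dn][dn]e?sday"), "3"); // "Wendsday", "Wednsday", ...
            for (String token : new String[] {"01st", "Wendsday"}) {
                normDay.forEach((pattern, value) -> {
                    if (pattern.matcher(token).matches()) {
                        System.out.println(token + " -> " + value);
                    }
                });
            }
        }
    }

(The Java 8 lambda assumes the source/target bump to 1.8 in the pom above.)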
diff --git a/resources/english/normalization/resources_normalization_normDurationNumber.txt b/resources/english/normalization/resources_normalization_normDurationNumber.txt
index c514cc9d..bab75f98 100644
--- a/resources/english/normalization/resources_normalization_normDurationNumber.txt
+++ b/resources/english/normalization/resources_normalization_normDurationNumber.txt
@@ -1,10 +1,10 @@
 // author: Jannik Strötgen
 // email: stroetgen@uni-hd.de
 // date: 2011-06-10
 // This file contains "duration numbers" and their normalized expressions
 // according to TIMEX3 format.
 // For example, the normalized value of "one" is "1"
 // FORMAT: "duration-number","normalized-duration-number"
 "0","0"
 "00","0"
 "1","1"
@@ -47,7 +47,7 @@
 "29","29"
 "30","30"
 "31","31"
 // normal numbers
 "one","1"
 "two","2"
 "three","3"
@@ -68,158 +68,86 @@
 "eighteen","18"
 "nineteen","19"
 "twenty","20"
-"twenty-one","21"
-"twenty-two","22"
-"twenty-three","23"
-"twenty-four","24"
-"twenty-five","25"
-"twenty-six","26"
-"twenty-seven","27"
-"twenty-eight","28"
-"twenty-nine","29"
-"twenty one","21"
-"twenty two","22"
-"twenty three","23"
-"twenty four","24"
-"twenty five","25"
-"twenty six","26"
-"twenty seven","27"
-"twenty eight","28"
-"twenty nine","29"
+"twenty[ -]?one","21"
+"twenty[ -]?two","22"
+"twenty[ -]?three","23"
+"twenty[ -]?four","24"
+"twenty[ -]?five","25"
+"twenty[ -]?six","26"
+"twenty[ -]?seven","27"
+"twenty[ -]?eight","28"
+"twenty[ -]?nine","29"
 "thirty","30"
-"thirty-one","31"
-"thirty-two","32"
-"thirty-three","33"
-"thirty-four","34"
-"thirty-five","35"
-"thirty-six","36"
-"thirty-seven","37"
-"thirty-eight","38"
-"thirty-nine","39"
-"thirty one","31"
-"thirty two","32"
-"thirty three","33"
-"thirty four","34"
-"thirty five","35"
-"thirty six","36"
-"thirty seven","37"
-"thirty eight","38"
-"thirty nine","39"
+"thirty[ -]?one","31"
+"thirty[ -]?two","32"
+"thirty[ -]?three","33"
+"thirty[ -]?four","34"
+"thirty[ -]?five","35"
+"thirty[ -]?six","36"
+"thirty[ -]?seven","37"
+"thirty[ -]?eight","38"
+"thirty[ -]?nine","39"
 "forty","40"
-"forty-one","41"
-"forty-two","42"
-"forty-three","43"
-"forty-four","44"
-"forty-five","45"
-"forty-six","46"
-"forty-seven","47"
-"forty-eight","48"
-"forty-nine","49"
-"forty one","41"
-"forty two","42"
-"forty three","43"
-"forty four","44"
-"forty five","45"
-"forty six","46"
-"forty seven","47"
-"forty eight","48"
-"forty nine","49"
+"forty[ -]?one","41"
+"forty[ -]?two","42"
+"forty[ -]?three","43"
+"forty[ -]?four","44"
+"forty[ -]?five","45"
+"forty[ -]?six","46"
+"forty[ -]?seven","47"
+"forty[ -]?eight","48"
+"forty[ -]?nine","49"
 "fifty","50"
-"fifty-one","51"
-"fifty-two","52"
-"fifty-three","53"
-"fifty-four","54"
-"fifty-five","55"
-"fifty-six","56"
-"fifty-seven","57"
-"fifty-eight","58"
-"fifty-nine","59"
-"fifty one","51"
-"fifty two","52"
-"fifty three","53"
-"fifty four","54"
-"fifty five","55"
-"fifty six","56"
-"fifty seven","57"
-"fifty eight","58"
-"fifty nine","59"
+"fifty[ -]?one","51"
+"fifty[ -]?two","52"
+"fifty[ -]?three","53"
+"fifty[ -]?four","54"
+"fifty[ -]?five","55"
+"fifty[ -]?six","56"
+"fifty[ -]?seven","57"
+"fifty[ -]?eight","58"
+"fifty[ -]?nine","59"
 "sixty","60"
-"sixty-one","61"
-"sixty-two","62" -"sixty-three","63" -"sixty-four","64" -"sixty-five","65" -"sixty-six","66" -"sixty-seven","67" -"sixty-eight","68" -"sixty-nine","69" -"sixty one","61" -"sixty two","62" -"sixty three","63" -"sixty four","64" -"sixty five","65" -"sixty six","66" -"sixty seven","67" -"sixty eight","68" -"sixty nine","69" +"sixty[ -]?one","61" +"sixty[ -]?two","62" +"sixty[ -]?three","63" +"sixty[ -]?four","64" +"sixty[ -]?five","65" +"sixty[ -]?six","66" +"sixty[ -]?seven","67" +"sixty[ -]?eight","68" +"sixty[ -]?nine","69" "seventy","70" -"seventy-one","71" -"seventy-two","72" -"seventy-three","73" -"seventy-four","74" -"seventy-five","75" -"seventy-six","76" -"seventy-seven","77" -"seventy-eight","78" -"seventy-nine","79" -"seventy one","71" -"seventy two","72" -"seventy three","73" -"seventy four","74" -"seventy five","75" -"seventy six","76" -"seventy seven","77" -"seventy eight","78" -"seventy nine","79" +"seventy[ -]?one","71" +"seventy[ -]?two","72" +"seventy[ -]?three","73" +"seventy[ -]?four","74" +"seventy[ -]?five","75" +"seventy[ -]?six","76" +"seventy[ -]?seven","77" +"seventy[ -]?eight","78" +"seventy[ -]?nine","79" "eighty","80" -"eighty-one","81" -"eighty-two","82" -"eighty-three","83" -"eighty-four","84" -"eighty-five","85" -"eighty-six","86" -"eighty-seven","87" -"eighty-eight","88" -"eighty-nine","89" -"eighty one","81" -"eighty two","82" -"eighty three","83" -"eighty four","84" -"eighty five","85" -"eighty six","86" -"eighty seven","87" -"eighty eight","88" -"eighty nine","89" +"eighty[ -]?one","81" +"eighty[ -]?two","82" +"eighty[ -]?three","83" +"eighty[ -]?four","84" +"eighty[ -]?five","85" +"eighty[ -]?six","86" +"eighty[ -]?seven","87" +"eighty[ -]?eight","88" +"eighty[ -]?nine","89" "ninety","90" -"ninety-one","91" -"ninety-two","92" -"ninety-three","93" -"ninety-four","94" -"ninety-five","95" -"ninety-six","96" -"ninety-seven","97" -"ninety-eight","98" -"ninety-nine","99" -"ninety one","91" -"ninety two","92" -"ninety three","93" -"ninety four","94" -"ninety five","95" -"ninety six","96" -"ninety seven","97" -"ninety eight","98" -"ninety nine","99" -// UPPER CASE +"ninety[ -]?one","91" +"ninety[ -]?two","92" +"ninety[ -]?three","93" +"ninety[ -]?four","94" +"ninety[ -]?five","95" +"ninety[ -]?six","96" +"ninety[ -]?seven","97" +"ninety[ -]?eight","98" +"ninety[ -]?nine","99" +//[ -]?UPPER CASE "One","1" "Two","2" "Three","3" @@ -240,155 +168,83 @@ "Eighteen","18" "Nineteen","19" "Twenty","20" -"Twenty-one","21" -"Twenty-two","22" -"Twenty-three","23" -"Twenty-four","24" -"Twenty-five","25" -"Twenty-six","26" -"Twenty-seven","27" -"Twenty-eight","28" -"Twenty-nine","29" -"Twenty one","21" -"Twenty two","22" -"Twenty three","23" -"Twenty four","24" -"Twenty five","25" -"Twenty six","26" -"Twenty seven","27" -"Twenty eight","28" -"Twenty nine","29" +"Twenty[ -]?one","21" +"Twenty[ -]?two","22" +"Twenty[ -]?three","23" +"Twenty[ -]?four","24" +"Twenty[ -]?five","25" +"Twenty[ -]?six","26" +"Twenty[ -]?seven","27" +"Twenty[ -]?eight","28" +"Twenty[ -]?nine","29" "Thirty","30" -"Thirty-one","31" -"Thirty-two","32" -"Thirty-three","33" -"Thirty-four","34" -"Thirty-five","35" -"Thirty-six","36" -"Thirty-seven","37" -"Thirty-eight","38" -"Thirty-nine","39" -"Thirty one","31" -"Thirty two","32" -"Thirty three","33" -"Thirty four","34" -"Thirty five","35" -"Thirty six","36" -"Thirty seven","37" -"Thirty eight","38" -"Thirty nine","39" +"Thirty[ -]?one","31" +"Thirty[ -]?two","32" +"Thirty[ -]?three","33" +"Thirty[ -]?four","34" +"Thirty[ -]?five","35" +"Thirty[ 
-]?six","36" +"Thirty[ -]?seven","37" +"Thirty[ -]?eight","38" +"Thirty[ -]?nine","39" "Forty","40" -"Forty-one","41" -"Forty-two","42" -"Forty-three","43" -"Forty-four","44" -"Forty-five","45" -"Forty-six","46" -"Forty-seven","47" -"Forty-eight","48" -"Forty-nine","49" -"Forty one","41" -"Forty two","42" -"Forty three","43" -"Forty four","44" -"Forty five","45" -"Forty six","46" -"Forty seven","47" -"Forty eight","48" -"Forty nine","49" +"Forty[ -]?one","41" +"Forty[ -]?two","42" +"Forty[ -]?three","43" +"Forty[ -]?four","44" +"Forty[ -]?five","45" +"Forty[ -]?six","46" +"Forty[ -]?seven","47" +"Forty[ -]?eight","48" +"Forty[ -]?nine","49" "Fifty","50" -"Fifty-one","51" -"Fifty-two","52" -"Fifty-three","53" -"Fifty-four","54" -"Fifty-five","55" -"Fifty-six","56" -"Fifty-seven","57" -"Fifty-eight","58" -"Fifty-nine","59" -"Fifty one","51" -"Fifty two","52" -"Fifty three","53" -"Fifty four","54" -"Fifty five","55" -"Fifty six","56" -"Fifty seven","57" -"Fifty eight","58" -"Fifty nine","59" +"Fifty[ -]?one","51" +"Fifty[ -]?two","52" +"Fifty[ -]?three","53" +"Fifty[ -]?four","54" +"Fifty[ -]?five","55" +"Fifty[ -]?six","56" +"Fifty[ -]?seven","57" +"Fifty[ -]?eight","58" +"Fifty[ -]?nine","59" "Sixty","60" -"Sixty-one","61" -"Sixty-two","62" -"Sixty-three","63" -"Sixty-four","64" -"Sixty-five","65" -"Sixty-six","66" -"Sixty-seven","67" -"Sixty-eight","68" -"Sixty-nine","69" -"Sixty one","61" -"Sixty two","62" -"Sixty three","63" -"Sixty four","64" -"Sixty five","65" -"Sixty six","66" -"Sixty seven","67" -"Sixty eight","68" -"Sixty nine","69" +"Sixty[ -]?one","61" +"Sixty[ -]?two","62" +"Sixty[ -]?three","63" +"Sixty[ -]?four","64" +"Sixty[ -]?five","65" +"Sixty[ -]?six","66" +"Sixty[ -]?seven","67" +"Sixty[ -]?eight","68" +"Sixty[ -]?nine","69" "Seventy","70" -"Seventy-one","71" -"Seventy-two","72" -"Seventy-three","73" -"Seventy-four","74" -"Seventy-five","75" -"Seventy-six","76" -"Seventy-seven","77" -"Seventy-eight","78" -"Seventy-nine","79" -"Seventy one","71" -"Seventy two","72" -"Seventy three","73" -"Seventy four","74" -"Seventy five","75" -"Seventy six","76" -"Seventy seven","77" -"Seventy eight","78" -"Seventy nine","79" +"Seventy[ -]?one","71" +"Seventy[ -]?two","72" +"Seventy[ -]?three","73" +"Seventy[ -]?four","74" +"Seventy[ -]?five","75" +"Seventy[ -]?six","76" +"Seventy[ -]?seven","77" +"Seventy[ -]?eight","78" +"Seventy[ -]?nine","79" "Eighty","80" -"Eighty-one","81" -"Eighty-two","82" -"Eighty-three","83" -"Eighty-four","84" -"Eighty-five","85" -"Eighty-six","86" -"Eighty-seven","87" -"Eighty-eight","88" -"Eighty-nine","89" -"Eighty one","81" -"Eighty two","82" -"Eighty three","83" -"Eighty four","84" -"Eighty five","85" -"Eighty six","86" -"Eighty seven","87" -"Eighty eight","88" -"Eighty nine","89" +"Eighty[ -]?one","81" +"Eighty[ -]?two","82" +"Eighty[ -]?three","83" +"Eighty[ -]?four","84" +"Eighty[ -]?five","85" +"Eighty[ -]?six","86" +"Eighty[ -]?seven","87" +"Eighty[ -]?eight","88" +"Eighty[ -]?nine","89" "Ninety","90" -"Ninety-one","91" -"Ninety-two","92" -"Ninety-three","93" -"Ninety-four","94" -"Ninety-five","95" -"Ninety-six","96" -"Ninety-seven","97" -"Ninety-eight","98" -"Ninety-nine","99" -"Ninety one","91" -"Ninety two","92" -"Ninety three","93" -"Ninety four","94" -"Ninety five","95" -"Ninety six","96" -"Ninety seven","97" -"Ninety eight","98" -"Ninety nine","99" +"Ninety[ -]?one","91" +"Ninety[ -]?two","92" +"Ninety[ -]?three","93" +"Ninety[ -]?four","94" +"Ninety[ -]?five","95" +"Ninety[ -]?six","96" +"Ninety[ -]?seven","97" +"Ninety[ -]?eight","98" 
+"Ninety[ -]?nine","99" diff --git a/resources/english/normalization/resources_normalization_normHolidayFix.txt b/resources/english/normalization/resources_normalization_normHolidayFix.txt index 524ecf1a..c5928ec8 100644 --- a/resources/english/normalization/resources_normalization_normHolidayFix.txt +++ b/resources/english/normalization/resources_normalization_normHolidayFix.txt @@ -8,62 +8,48 @@ // the reference of the values is given in the repattern file // http://en.wikipedia.org/wiki/New_Year%27s_Day -"New Year's Day","01-01" +"New Year'?s [Dd]ay","01-01" // http://en.wikipedia.org/wiki/Groundhog_Day -"Groundhog Day","02-02" +"Groundhog [Dd]ay","02-02" // http://en.wikipedia.org/wiki/Valentine%27s_Day -"Valentines Day","02-14" -"Saint Valentines Day","02-14" -"St. Valentines Day","02-14" -"Valentine's Day","02-14" -"Saint Valentine's Day","02-14" -"St. Valentine's Day","02-14" -"Valentines' Day","02-14" -"Saint Valentines' Day","02-14" -"St. Valentines' Day","02-14" +"(?:Saint |St\. )?Valentine'?s'? [Dd]ay","02-14" // http://en.wikipedia.org/wiki/Patriots%27_Day -"Patriots Day","09-11" -"Patriots' Day","09-11" -"Patriot's Day","09-11" +"Patriot'?s'? [Dd]ay","09-11" // http://en.wikipedia.org/wiki/German-American_Day -"German-American Day","10-06" +"German-American [Dd]ay","10-06" // http://en.wikipedia.org/wiki/White_Cane_Safety_Day -"White Cane Safety Day","10-15" +"White Cane Safety [Dd]ay","10-15" // http://en.wikipedia.org/wiki/Boss%27s_Day -"Boss's Day","10-16" -"Bosses Day","10-16" -"Bosses' Day","10-16" +"Boss(?:'s|es|es') [Dd]ay","10-16" //won't work, because of the "$" in the string -//"The Bo$$ Day","10-16" -"National Boss Day","10-16" +"The Bo\$\$ [Dd]ay","10-16" +"National Boss [Dd]ay","10-16" // http://en.wikipedia.org/wiki/Independence_Day_%28US%29 -"Independence Day","07-04" +"Independence [Dd]ay","07-04" "Fourth of July","07-04" // http://en.wikipedia.org/wiki/Veterans_Day -"Veterans Day","11-11" -"Armistice Day","11-11" +"Veterans [Dd]ay","11-11" +"Armistice [Dd]ay","11-11" // http://en.wikipedia.org/wiki/Remembrance_Day -"Remembrance Day","11-11" -"Poppy Day","11-11" +"Remembrance [Dd]ay","11-11" +"Poppy [Dd]ay","11-11" // http://en.wikipedia.org/wiki/Epiphany_%28holiday%29 "Epiphany","02-06" "Theophany","02-06" // http://en.wikipedia.org/wiki/Hallowe%27en -"Halloween","10-31" -"All Hallows’ Evening","10-31" -"Hallowe'en","10-31" -"All Hallows' Eve","10-31" +"Hallowe'?en","10-31" +"All Hallows' [Ee]ve(?:ning)?","10-31" // http://en.wikipedia.org/wiki/Assumption_of_Mary "Assumption of Mary","08-15" @@ -71,69 +57,37 @@ "The Assumption","08-15" // http://en.wikipedia.org/wiki/Reformation_Day -"Reformation Day","10-31" +"Reformation [Dd]ay","10-31" // http://en.wikipedia.org/wiki/All_Saints%27_Day -"All Saints","11-01" -"All Saints' Day","11-01" +"All Saints'?(?: [Dd]ay)?","11-01" "Solemnity of All Saints","11-01" "All Hallows","11-01" "Hallowmas","11-01" // http://en.wikipedia.org/wiki/Christmas -"Christmas Eve","12-24" - -"Christmas","12-25" -"Christmas Day","12-25" -"Xmas","12-25" -"XMAS","12-25" +"Christmas [Ee]ve","12-24" +"Christmas(?: [Dd]ay)?","12-25" +"X-?(?:mas|MAS)","12-25" "Noel","12-25" "Yule","12-25" // http://en.wikipedia.org/wiki/New_Year%27s_Eve -"New Year's Eve","12-31" +"New Year'?s [Ee]ve","12-31" "Hogmanay","12-31" "Calennig","12-31" // http://en.wikipedia.org/wiki/May_Day -"May Day","05-01" -"International Workers Day","05-01" -"International Worker's Day","05-01" -"International Workers' Day","05-01" +"May [Dd]ay","05-01" +"International 
Worker'?s'? [Dd]ay","05-01" // http://en.wikipedia.org/wiki/Boxing_Day -"Boxing Day","12-26" +"Boxing [Dd]ay","12-26" // http://en.wikipedia.org/wiki/St._Patrick%27s_Day -"Saint Patricks Day","03-17" -"St. Patricks Day","03-17" -"Saint Patricks Day","03-17" -"St. Paddys Day","03-17" -"Saint Paddys Day","03-17" -"St. Pattys Day","03-17" -"Saint Pattys Day","03-17" - -"Saint Patrick's Day","03-17" -"St. Patrick's Day","03-17" -"Saint Patrick's Day","03-17" -"St. Paddy's Day","03-17" -"Saint Paddy's Day","03-17" -"St. Patty's Day","03-17" -"Saint Patty's Day","03-17" - -"Saint Patricks' Day","03-17" -"St. Patricks' Day","03-17" -"Saint Patricks' Day","03-17" -"St. Paddys' Day","03-17" -"Saint Paddys' Day","03-17" -"St. Pattys' Day","03-17" -"Saint Pattys' Day","03-17" +"(?:Saint|St\.) Pa(?:trick|ddy|tty)'?s'? [Dd]ay","03-17" // http://en.wikipedia.org/wiki/St._Andrew%27s_Day -"Saint Andrews Day","11-30" -"Saint Andrew's Day","11-30" -"Saint Andrews' Day","11-30" -"St. Andrews Day","11-30" -"St. Andrew's Day","11-30" -"St. Andrews' Day","11-30" +"(?:Saint|St\.) Andrew'?s'? [Dd]ay","11-30" + diff --git a/resources/english/normalization/resources_normalization_normHolidayVar.txt b/resources/english/normalization/resources_normalization_normHolidayVar.txt index 28422976..6acc3863 100644 --- a/resources/english/normalization/resources_normalization_normHolidayVar.txt +++ b/resources/english/normalization/resources_normalization_normHolidayVar.txt @@ -22,9 +22,9 @@ // http://en.wikipedia.org/wiki/Maundy_Thursday "Maundy Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))" +"Great & Holy Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))" "Holy Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))" "Covenant Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))" -"Great & Holy Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))" "Thursday of Mysteries","00-00 funcDateCalc(EasterSunday(YEAR, -3))" // http://en.wikipedia.org/wiki/Good_Friday @@ -42,23 +42,17 @@ "Joyous Saturday","00-00 funcDateCalc(EasterSunday(YEAR, -1))" // http://en.wikipedia.org/wiki/Easter -"Easter","00-00 funcDateCalc(EasterSunday(YEAR, 0))" -"Easter Day","00-00 funcDateCalc(EasterSunday(YEAR, 0))" -"Easter Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 0))" -"Resurrection Day","00-00 funcDateCalc(EasterSunday(YEAR, 0))" -"Resurrection Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 0))" +"Easter(?: ?[Ss]unday| ?[Dd]ay)?","00-00 funcDateCalc(EasterSunday(YEAR, 0))" +"Resurrection(?: ?[Ss]unday| ?[Dd]ay)?","00-00 funcDateCalc(EasterSunday(YEAR, 0))" // http://en.wikipedia.org/wiki/Easter_Monday "Easter Monday","00-00 funcDateCalc(EasterSunday(YEAR, 1))" // http://en.wikipedia.org/wiki/Octave_of_Easter -"Octave of Easter","00-00 funcDateCalc(EasterSunday(YEAR, 7))" -"Octave Day of Easter","00-00 funcDateCalc(EasterSunday(YEAR, 7))" +"Octave(?: [Dd]ay)? of Easter","00-00 funcDateCalc(EasterSunday(YEAR, 7))" "Low Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))" -"Saint Thomas Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))" -"St. Thomas Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))" -"Quasimodo Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))" -"Quasimodogeniti","00-00 funcDateCalc(EasterSunday(YEAR, 7))" +"(?:Saint|St\.) 
Thomas Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))" +"Quasimodo(?: Sunday|geniti)","00-00 funcDateCalc(EasterSunday(YEAR, 7))" "Second Sunday of Easter","00-00 funcDateCalc(EasterSunday(YEAR, 7))" "Divine Mercy Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))" @@ -68,65 +62,55 @@ "Ascension Thursday","00-00 funcDateCalc(EasterSunday(YEAR, 39))" // http://en.wikipedia.org/wiki/Father%27s_Day -"Father's Day","06-00 funcDateCalc(EasterSunday(YEAR, 39))" +"Father'?s'? [Dd]ay","06-00 funcDateCalc(EasterSunday(YEAR, 39))" // http://en.wikipedia.org/wiki/Reformation_Day "Reformation Sunday","10-00 funcDateCalc(WeekdayRelativeTo(YEAR-10-31, 1, -1, true))" // http://en.wikipedia.org/wiki/Pentecost "Pentecost","00-00 funcDateCalc(EasterSunday(YEAR, 49))" -"Whit Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 49))" -"Whitsun","00-00 funcDateCalc(EasterSunday(YEAR, 49))" -"Whit","00-00 funcDateCalc(EasterSunday(YEAR, 49))" +"Whit(?:sun| ?[Ss]unday)?","00-00 funcDateCalc(EasterSunday(YEAR, 49))" // http://en.wikipedia.org/wiki/Corpus_Christi_%28feast%29 "Corpus Christi","00-00 funcDateCalc(EasterSunday(YEAR, 60))" "The Most Holy Body and Blood of Christ","00-00 funcDateCalc(EasterSunday(YEAR, 60))" // advent sundays: the four sundays before christmas(12-24) -"1. Advent","00-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -4, false))" -"1st Advent","00-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -4, false))" +"1(?:st|\.)? Advent","00-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -4, false))" "first Advent","00-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -4, false))" -"2. Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -3, false))" -"2nd Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -3, false))" +"2(?:nd|\.)? Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -3, false))" "second Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -3, false))" -"3. Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -2, false))" -"3rd Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -2, false))" +"3(?:rd|\.)? Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -2, false))" "third Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -2, false))" -"4. Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -1, false))" -"4th Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -1, false))" +"4(?:th|\.)? Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -1, false))" "fourth Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -1, false))" // http://en.wikipedia.org/wiki/Black_Friday_%28shopping%29 "Black Friday","11-00 funcDateCalc(WeekdayRelativeTo(YEAR-11-01, 6, 4, true))" // http://en.wikipedia.org/wiki/Martin_Luther_King,_Jr._Day -"Birthday of Martin Luther King, Jr.","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))" -"Martin Luther King Day","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))" -"Martin Luther King, Jr. Day","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))" +"Birthday of Martin Luther King, Jr\.","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))" +"Martin Luther King(?:, Jr\.)? 
[Dd]ay","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))" // http://en.wikipedia.org/wiki/Washington%27s_Birthday -"Presidents Day","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))" -"Presidents' Day","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))" -"President's Day","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))" +"President'?s'? [Dd]ay","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))" "Washington's Birthday","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))" // http://en.wikipedia.org/wiki/Memorial_Day -"Memorial Day","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-06-01, 2, -1, false))" -"Decoration Day","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-06-01, 2, -1, false))" +"Memorial [Dd]ay","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-06-01, 2, -1, false))" +"Decoration [Dd]ay","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-06-01, 2, -1, false))" // http://en.wikipedia.org/wiki/Labor_Day -"Labor Day","09-00 funcDateCalc(WeekdayRelativeTo(YEAR-09-01, 2, 1, true))" +"Labor [Dd]ay","09-00 funcDateCalc(WeekdayRelativeTo(YEAR-09-01, 2, 1, true))" // http://en.wikipedia.org/wiki/Columbus_Day -"Columbus Day","10-00 funcDateCalc(WeekdayRelativeTo(YEAR-10-01, 2, 2, true))" +"Columbus [Dd]ay","10-00 funcDateCalc(WeekdayRelativeTo(YEAR-10-01, 2, 2, true))" // http://en.wikipedia.org/wiki/Thanksgiving_%28United_States%29 -"Thanksgiving","11-00 funcDateCalc(WeekdayRelativeTo(YEAR-11-01, 5, 4, true))" -"Thanksgiving Day","11-00 funcDateCalc(WeekdayRelativeTo(YEAR-11-01, 5, 4, true))" +"Thanksgiving(?: [Dd]ay)?","11-00 funcDateCalc(WeekdayRelativeTo(YEAR-11-01, 5, 4, true))" // http://en.wikipedia.org/wiki/Mother%27s_Day -"Mother's Day","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-05-01, 1, 2, true))" +"Mother'?s'? 
[Dd]ay","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-05-01, 1, 2, true))" diff --git a/resources/english/normalization/resources_normalization_normPartOfYear.txt b/resources/english/normalization/resources_normalization_normPartOfYear.txt index 1edc8e19..420e3e67 100644 --- a/resources/english/normalization/resources_normalization_normPartOfYear.txt +++ b/resources/english/normalization/resources_normalization_normPartOfYear.txt @@ -6,19 +6,13 @@ // For example, the normalized value of "first quarter" is "Q1" // FORMAT: "parts-of-year-word","normalized-parts-of-year-word" // First Quarter -"([Ff]iscal(-| ))?([Ff]irst|1st)(-| )quarter","Q1" -"([Ff]iscal(-| ))?([Ss]econd|2nd)(-| )quarter","Q2" -"([Ff]iscal(-| ))?([Tt]hird|3rd)(-| )quarter","Q3" -"([Ff]iscal(-| ))?([Ff]ou?rth|4th)(-| )quarter","Q4" -"last-quarter","Q4" -"Last-quarter","Q4" -"last quarter","Q4" -"Last quarter","Q4" +"([Ff]iscal[ -])?([Ff]irst|1st)[ -]quarter","Q1" +"([Ff]iscal[ -])?([Ss]econd|2nd)[ -]quarter","Q2" +"([Ff]iscal[ -])?([Tt]hird|3rd)[ -]quarter","Q3" +"([Ff]iscal[ -])?([Ff]ou?rth|4th)[ -]quarter","Q4" +"[Ll]ast[ -]quarter","Q4" // First Half -"([Ff]iscal(-| ))?([Ff]irst|1st)(-| )half","H1" -"([Ff]iscal(-| ))?([Ss]econd|2nd)(-| )half","H2" +"([Ff]iscal[ -])?([Ff]irst|1st)[ -]half","H1" // Second Half -"last-half","H2" -"Last-half","H2" -"last half","H2" -"Last half","H2" +"([Ff]iscal[ -])?([Ss]econd|2nd)[ -]half","H2" +"[Ll]ast[ -]half","H2" diff --git a/resources/english/normalization/resources_normalization_normPartWords.txt b/resources/english/normalization/resources_normalization_normPartWords.txt index 3bc0b0bd..48c06f26 100644 --- a/resources/english/normalization/resources_normalization_normPartWords.txt +++ b/resources/english/normalization/resources_normalization_normPartWords.txt @@ -7,12 +7,20 @@ // FORMAT: "part word","normalized-part-word" "The middle of","MID" "the middle of","MID" +"Middle of","MID" +"middle of","MID" "The end of","END" "the end of","END" +"End of","END" +"end of","END" "The beginning of","START" "the beginning of","START" +"Beginning of","START" +"beginning of","START" "The start of","START" "the start of","START" +"Start of","START" +"start of","START" "Late","END" "late","END" "Later","END" @@ -28,4 +36,6 @@ "Fiscal-","" "fiscal-","" "Fiscal","" -"fiscal","" \ No newline at end of file +"fiscal","" +"Dawn of","START" +"dawn of","START" diff --git a/resources/english/normalization/resources_normalization_normUnit.txt b/resources/english/normalization/resources_normalization_normUnit.txt index 306692f0..5fbfd570 100644 --- a/resources/english/normalization/resources_normalization_normUnit.txt +++ b/resources/english/normalization/resources_normalization_normUnit.txt @@ -4,28 +4,17 @@ // This file contains "unit words" and their normalized expressions. 
// For example, the normalized value of "week" is "week" // FORMAT: "unit-word","normalized-unit-word" -"[Dd]ay","day" -"[Ww]eek-end","week-WE" -"[Ww]eekend","week-WE" -"[Ww]eek","week" -"[Mm]onth","month" -"[Qq]uarter","quarter" +"[Dd]ays?","day" +"[Ww]eek-?ends?","week-WE" +"[Ww]eeks?","week" +"[Mm]onths?","month" +"[Qq]uarters?","quarter" +"[Yy]ears?","year" "[Ff]iscal years?","year" -"[Yy]ear","year" -"[Dd]ecade","decade" +"[Dd]ecades?","decade" "[Cc]entury","century" -// Plurals (not in reUnit) -"[Dd]ays","day" -"[Ww]eek-ends","week-WE" -"[Ww]eekends","week-WE" -"[Ww]eeks","week" -"[Mm]onths","month" -"[Qq]uarters","quarter" -"[Yy]ears","year" -"[Dd]ecades","decade" -// not in reUnit "[Cc]enturies","century" +"[Tt]rading days?","day" "[Hh]ours?","hour" "[Mm]inutes?","minute" -"[Tt]rading days?","day" - +"[Ss]econds?","second" diff --git a/resources/english/normalization/resources_normalization_normWeekday.txt b/resources/english/normalization/resources_normalization_normWeekday.txt index 47ae295b..d7da1f6f 100644 --- a/resources/english/normalization/resources_normalization_normWeekday.txt +++ b/resources/english/normalization/resources_normalization_normWeekday.txt @@ -18,4 +18,4 @@ "Friday","friday" "Saturday","saturday" "Sunday","sunday" - +"[Ww]e[dn][nd]e?sday","wednesday" diff --git a/resources/english/normalization/resources_normalization_normYearPrefix.txt b/resources/english/normalization/resources_normalization_normYearPrefix.txt index 672c8800..f232435e 100644 --- a/resources/english/normalization/resources_normalization_normYearPrefix.txt +++ b/resources/english/normalization/resources_normalization_normYearPrefix.txt @@ -4,7 +4,11 @@ // This file contains "BCyears" and their normalized expressions. // For example, the normalized value of "BC" is "BC" // FORMAT: "yearPrefix","normalized-yearPrefix" -"B[\.]?C[\.]?","BC" -"A[\.]?D[\.]?","" +"BC","BC" +"AD","" +"B\.C\.","BC" +"A\.D\.","" "BCE","BC" -"CE","" \ No newline at end of file +"CE","" +"B\.C\.E\.","BC" +"C\.E\.","" \ No newline at end of file diff --git a/resources/english/repattern/resources_repattern_reAndOrTo.txt b/resources/english/repattern/resources_repattern_reAndOrTo.txt index 73f4ad21..ebac0311 100644 --- a/resources/english/repattern/resources_repattern_reAndOrTo.txt +++ b/resources/english/repattern/resources_repattern_reAndOrTo.txt @@ -3,9 +3,5 @@ // date: 2013-10-17 // This file contains regular expression patterns for "and", "or", "to" words. // FORMAT: one line is one disjunction of the pattern -[\s]?\–[\s]? -[\s]?-[\s]? -[\s]?–[\s]? - and - or - to \ No newline at end of file +\s(?:and|or|to)\s +\s?[/–‒‑-]\s? diff --git a/resources/english/repattern/resources_repattern_reApproximate.txt b/resources/english/repattern/resources_repattern_reApproximate.txt index bf98fd31..302d3d31 100644 --- a/resources/english/repattern/resources_repattern_reApproximate.txt +++ b/resources/english/repattern/resources_repattern_reApproximate.txt @@ -3,20 +3,11 @@ // date: 2011-06-10 // This file contains regular expression patterns for "approximate" words. 
diff --git a/resources/english/repattern/resources_repattern_reApproximate.txt b/resources/english/repattern/resources_repattern_reApproximate.txt
index bf98fd31..302d3d31 100644
--- a/resources/english/repattern/resources_repattern_reApproximate.txt
+++ b/resources/english/repattern/resources_repattern_reApproximate.txt
@@ -3,20 +3,11 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for "approximate" words.
 // FORMAT: one line is one disjunction of the pattern
-// about
-[Aa]pproximately
-[Aa]bout
-[Aa]round
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
+[Aa](?:bout|lmost|pproximately|round|t least)
 [Cc]irca
-// less
-[Nn]o more than
-[Nn]o longer than
-[Uu]p to
-[Ll]ess than
-[Nn]early
-[Aa]lmost
-// more
-[Aa]t least
+[Ll](?:ess|onger) than
 [Mm]ore than
-[Ll]onger than
+[Nn](?:early|o (?:more|longer) than)
 [Oo]ver
+[Uu]p to
diff --git a/resources/english/repattern/resources_repattern_reDateWord.txt b/resources/english/repattern/resources_repattern_reDateWord.txt
index f44a1da7..f6a975e1 100644
--- a/resources/english/repattern/resources_repattern_reDateWord.txt
+++ b/resources/english/repattern/resources_repattern_reDateWord.txt
@@ -13,10 +13,8 @@
 [Rr]ight now
 [Nn]ow
 [Aa]s soon as possible
-[Rr]ecently
-[Rr]ecent
-[Cc]urrently
-[Cc]urrent
+[Rr]ecent(?:ly)?
+[Cc]urrent(?:ly)?
 // [Ss]oon
 // [Pp]reviously
 // [Yy]et
diff --git a/resources/english/repattern/resources_repattern_reDayNumber.txt b/resources/english/repattern/resources_repattern_reDayNumber.txt
index f045823a..c5364d53 100644
--- a/resources/english/repattern/resources_repattern_reDayNumber.txt
+++ b/resources/english/repattern/resources_repattern_reDayNumber.txt
@@ -3,6 +3,8 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for day numbers.
 // FORMAT: one line is one disjunction of the pattern
-[12][0-9]
+0[1-9]
+1[0-9]
+2[0-9]
 3[01]
-0?[1-9]
\ No newline at end of file
+[1-9]
diff --git a/resources/english/repattern/resources_repattern_reDayNumberTh.txt b/resources/english/repattern/resources_repattern_reDayNumberTh.txt
index 9465d9dc..153f0660 100644
--- a/resources/english/repattern/resources_repattern_reDayNumberTh.txt
+++ b/resources/english/repattern/resources_repattern_reDayNumberTh.txt
@@ -3,16 +3,19 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for day digit th.
 // FORMAT: one line is one disjunction of the pattern
-[123]0th
-[23]?1st
-[2]?2nd
-[2]?3rd
-[12]?4th
-[12]?5th
-[12]?6th
-[12]?7th
-[12]?8th
-[12]?9th
-11th
-12th
-13th
\ No newline at end of file
+1st
+01st
+2nd
+02nd
+3rd
+03rd
+[4-9]th
+0[4-9]th
+1[0-9]th
+20th
+21st
+22nd
+23rd
+2[4-9]th
+30th
+31st
diff --git a/resources/english/repattern/resources_repattern_reDayWordTh.txt b/resources/english/repattern/resources_repattern_reDayWordTh.txt
index 2d5eae0b..cc813c39 100644
--- a/resources/english/repattern/resources_repattern_reDayWordTh.txt
+++ b/resources/english/repattern/resources_repattern_reDayWordTh.txt
@@ -3,6 +3,16 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for day word th.
 // FORMAT: one line is one disjunction of the pattern
+[Ff]irst
+[Ss]econd
+[Tt]hird
+[Ff]ourth
+[Ff]ifth
+[Ss]ixth
+[Ss]eventh
+[Ee]ighth
+[Nn]inth
+// 10
 [Tt]enth
 [Ee]leventh
 [Tt]welfth
@@ -13,24 +23,9 @@
 [Ss]eventeenth
 [Ee]ighteenth
 [Nn]ineteenth
+// 20
 [Tt]wentieth
-[Tt]wenty-first
-[Tt]wenty-second
-[Tt]wenty-third
-[Tt]wenty-fourth
-[Tt]wenty-fifth
-[Tt]wenty-sixth
-[Tt]wenty-seventh
-[Tt]wenty-eighth
-[Tt]wenty-ninth
+[Tt]wenty-(?:first|second|third|fourth|fifth|sixth|seventh|eighth|ninth)
+// 30
 [Tt]hirtieth
 [Tt]hirty-first
-[Ff]irst
-[Ss]econd
-[Tt]hird
-[Ff]ourth
-[Ff]ifth
-[Ss]ixth
-[Ss]eventh
-[Ee]ighth
-[Nn]inth
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reHolidayFix.txt b/resources/english/repattern/resources_repattern_reHolidayFix.txt
index 0c0820a6..3d111591 100644
--- a/resources/english/repattern/resources_repattern_reHolidayFix.txt
+++ b/resources/english/repattern/resources_repattern_reHolidayFix.txt
@@ -5,62 +5,48 @@
 // FORMAT: one line is one disjunction of the pattern
 
 // http://en.wikipedia.org/wiki/New_Year%27s_Day
-New Year's Day
+New Year'?s [Dd]ay
 
 // http://en.wikipedia.org/wiki/Groundhog_Day
-Groundhog Day
+Groundhog [Dd]ay
 
 // http://en.wikipedia.org/wiki/Valentine%27s_Day
-Saint Valentines Day
-St\. Valentines Day
-Valentines Day
-St\. Valentine's Day
-Saint Valentine's Day
-Valentine's Day
-Saint Valentines' Day
-St\. Valentines' Day
-Valentines' Day
+(?:Saint |St\. )?Valentine'?s'? [Dd]ay
 
 // http://en.wikipedia.org/wiki/Patriots%27_Day
-Patriots Day
-Patriots' Day
-Patriot's Day
+Patriot'?s'? [Dd]ay
 
 // http://en.wikipedia.org/wiki/German-American_Day
-German-American Day
+German-American [Dd]ay
 
 // http://en.wikipedia.org/wiki/White_Cane_Safety_Day
-White Cane Safety Day
+White Cane Safety [Dd]ay
 
 // http://en.wikipedia.org/wiki/Boss%27s_Day
-Boss's Day
-Bosses Day
-Bosses' Day
-//won't work, because of the "$" in the string
-//The Bo$$ Day
-National Boss Day
+Boss(?:'s|es|es') [Dd]ay
+The Bo\$\$ [Dd]ay
+National Boss [Dd]ay
 
 // http://en.wikipedia.org/wiki/Independence_Day_%28US%29
-Independence Day
+Independence [Dd]ay
 Fourth of July
 
 // http://en.wikipedia.org/wiki/Veterans_Day
-Veterans Day
-Armistice Day
+Veterans [Dd]ay
+Armistice [Dd]ay
 
 // http://en.wikipedia.org/wiki/Remembrance_Day
-Remembrance Day
-Poppy Day
+Remembrance [Dd]ay
+Poppy [Dd]ay
 
 // http://en.wikipedia.org/wiki/Epiphany_%28holiday%29
 Epiphany
 Theophany
 
 // http://en.wikipedia.org/wiki/Hallowe%27en
-Halloween
-All Hallows’ Evening
-Hallowe'en
-All Hallows' Eve
+Hallowe'?en
+All Hallows' [Ee]ve(?:ning)?
 
 // http://en.wikipedia.org/wiki/Assumption_of_Mary
 Assumption of Mary
@@ -68,65 +54,35 @@
 Assumption of the Blessed Virgin Mary into Heaven
 The Assumption
 
 // http://en.wikipedia.org/wiki/Reformation_Day
-Reformation Day
+Reformation [Dd]ay
 
 // http://en.wikipedia.org/wiki/All_Saints%27_Day
-All Saints' Day
-All Saints
+All Saints'?(?: [Dd]ay)?
 Solemnity of All Saints
 All Hallows
 Hallowmas
 
 // http://en.wikipedia.org/wiki/Christmas
-Christmas Eve
-Christmas Day
-Christmas
-Xmas
-XMAS
+Christmas(?: [Ee]ve| [Dd]ay)?
+X-?(?:mas|MAS)
 Noel
 Yule
 
 // http://en.wikipedia.org/wiki/New_Year%27s_Eve
-New Year's Eve
+New Year'?s [Ee]ve
 Hogmanay
 Calennig
 
 // http://en.wikipedia.org/wiki/May_Day
-May Day
-International Workers Day
-International Worker's Day
-International Workers' Day
+May [Dd]ay
+International Worker'?s'? [Dd]ay
 
 // http://en.wikipedia.org/wiki/Boxing_Day
-Boxing Day
+Boxing [Dd]ay
 
 // http://en.wikipedia.org/wiki/St._Patrick%27s_Day
-Saint Patricks Day
-St\. Patricks Day
-St\. Paddys Day
-Saint Paddys Day
-St\. Pattys Day
-Saint Pattys Day
-
-Saint Patrick's Day
-St\. Patrick's Day
-St\. Paddy's Day
-Saint Paddy's Day
-St\. Patty's Day
-Saint Patty's Day
-
-Saint Patricks' Day
-St\. Patricks' Day
-St\. Paddys' Day
-Saint Paddys' Day
-St\. Pattys' Day
-Saint Pattys' Day
+(?:Saint|St\.) Pa(?:trick|ddy|tty)'?s'? [Dd]ay
 
 // http://en.wikipedia.org/wiki/St._Andrew%27s_Day
-Saint Andrews Day
-Saint Andrew's Day
-Saint Andrews' Day
-St\. Andrews Day
-St\. Andrew's Day
-St\. Andrews' Day
+(?:Saint|St\.) Andrew'?s'? [Dd]ay
diff --git a/resources/english/repattern/resources_repattern_reHolidayVar.txt b/resources/english/repattern/resources_repattern_reHolidayVar.txt
index 56b970dd..d0b7af68 100644
--- a/resources/english/repattern/resources_repattern_reHolidayVar.txt
+++ b/resources/english/repattern/resources_repattern_reHolidayVar.txt
@@ -35,23 +35,17 @@
 The Great Sabbath
 Joyous Saturday
 
 // http://en.wikipedia.org/wiki/Easter
-Easter Sunday
-Easter Day
-Easter
-Resurrection Day
-Resurrection Sunday
+Easter(?: ?[Ss]unday| ?[Dd]ay)?
+Resurrection(?: ?[Ss]unday| ?[Dd]ay)?
 
 // http://en.wikipedia.org/wiki/Easter_Monday
 Easter Monday
 
 // http://en.wikipedia.org/wiki/Octave_of_Easter
-Octave of Easter
-Octave Day of Easter
+Octave(?: [Dd]ay)? of Easter
 Low Sunday
-Saint Thomas Sunday
-St\. Thomas Sunday
-Quasimodo Sunday
-Quasimodogeniti
+(?:Saint|St\.) Thomas Sunday
+Quasimodo(?: Sunday|geniti)
 Second Sunday of Easter
 Divine Mercy Sunday
 
@@ -61,36 +55,30 @@
 Solemnity of the Ascension of the Lord
 Ascension Thursday
 
 // http://en.wikipedia.org/wiki/Father%27s_Day
-Father's Day
+Father'?s'? [Dd]ay
 
 // http://en.wikipedia.org/wiki/Reformation_Day
 Reformation Sunday
 
 // http://en.wikipedia.org/wiki/Pentecost
 Pentecost
-Whit Sunday
-Whitsun
-Whit
+Whit(?:sun| ?[Ss]unday)?
 
 // http://en.wikipedia.org/wiki/Corpus_Christi_%28feast%29
 Corpus Christi
 The Most Holy Body and Blood of Christ
 
 // advent sundays: the four sundays before christmas(12-24)
-1\. Advent
-1st Advent
+1(?:st|\.)? Advent
 first Advent
-2\. Advent
-2nd Advent
+2(?:nd|\.)? Advent
 second Advent
-3\. Advent
-3rd Advent
+3(?:rd|\.)? Advent
 third Advent
-4\. Advent
-4th Advent
+4(?:th|\.)? Advent
 fourth Advent
 
 // http://en.wikipedia.org/wiki/Black_Friday_%28shopping%29
@@ -98,28 +86,24 @@
 Black Friday
 
 // http://en.wikipedia.org/wiki/Martin_Luther_King,_Jr._Day
 Birthday of Martin Luther King, Jr\.
-Martin Luther King Day
-Martin Luther King, Jr\. Day
+Martin Luther King(?:, Jr\.)? [Dd]ay
 
 // http://en.wikipedia.org/wiki/Washington%27s_Birthday
-Presidents Day
-Presidents' Day
-President's Day
+President'?s'? [Dd]ay
 Washington's Birthday
 
 // http://en.wikipedia.org/wiki/Memorial_Day
-Memorial Day
-Decoration Day
+Memorial [Dd]ay
+Decoration [Dd]ay
 
 // http://en.wikipedia.org/wiki/Labor_Day
-Labor Day
+Labor [Dd]ay
 
 // http://en.wikipedia.org/wiki/Columbus_Day
-Columbus Day
+Columbus [Dd]ay
 
 // http://en.wikipedia.org/wiki/Thanksgiving_%28United_States%29
-Thanksgiving Day
-Thanksgiving
+Thanksgiving(?: [Dd]ay)?
 
 // http://en.wikipedia.org/wiki/Mother%27s_Day
-Mother's Day
+Mother'?s'? [Dd]ay
diff --git a/resources/english/repattern/resources_repattern_reMonthLong.txt b/resources/english/repattern/resources_repattern_reMonthLong.txt
index 39c76d84..b8e80c1d 100644
--- a/resources/english/repattern/resources_repattern_reMonthLong.txt
+++ b/resources/english/repattern/resources_repattern_reMonthLong.txt
@@ -3,14 +3,11 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for long months.
 // FORMAT: one line is one disjunction of the pattern
-[Jj]anuary
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
+[Jj](?:anuary|une|uly)
 [Ff]ebruary
-[Mm]arch
-[Aa]pril
-[Mm]ay
-[Jj]une
-[Jj]uly
-[Aa]ugust
+[Mm](?:arch|ay)
+[Aa](?:pril|ugust)
 [Ss]eptember
 [Oo]ctober
 [Nn]ovember
diff --git a/resources/english/repattern/resources_repattern_reMonthNumber.txt b/resources/english/repattern/resources_repattern_reMonthNumber.txt
index 3435e188..6e419f5d 100644
--- a/resources/english/repattern/resources_repattern_reMonthNumber.txt
+++ b/resources/english/repattern/resources_repattern_reMonthNumber.txt
@@ -3,7 +3,5 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for month numbers.
 // FORMAT: one line is one disjunction of the pattern
-10
-11
-12
+1[0-2]
 0?[1-9]
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reMonthShort.txt b/resources/english/repattern/resources_repattern_reMonthShort.txt
index 513bf92a..073ff89f 100644
--- a/resources/english/repattern/resources_repattern_reMonthShort.txt
+++ b/resources/english/repattern/resources_repattern_reMonthShort.txt
@@ -3,53 +3,17 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for short months.
 // FORMAT: one line is one disjunction of the pattern
-[Jj]an\.
-[Jj]an
-[Ff]eb\.
-[Ff]eb
-[Mm]ar\.
-[Mm]ar
-[Aa]pr\.
-[Aa]pr
-[Mm]ay
-[Jj]un\.
-[Jj]un
-[Jj]ul\.
-[Jj]ul
-[Aa]ug\.
-[Aa]ug
-[Ss]ep\.
-[Ss]ep
-[Ss]ept\.
-[Ss]ept
-[Oo]ct\.
-[Oo]ct
-[Nn]ov\.
-[Nn]ov
-[Dd]ec\.
-[Dd]ec
-JAN\.
-JAN
-FEB\.
-FEB
-MAR\.
-MAR
-APR\.
-APR
-MAY
-JUN\.
-JUN
-JUL\.
-JUL
-AUG\.
-AUG
-SEP\.
-SEP
-SEPT\.
-SEPT
-OCT\.
-OCT
-NOV\.
-NOV
-DEC\.
-DEC
\ No newline at end of file
+// 2016-05-09: note that \.? causes problems with the pattern optimizer. Use (?:\.|) instead. -- Erich
+(?:jan|Jan|JAN)(?:\.|)
+(?:feb|Feb|FEB)(?:\.|)
+(?:mar|Mar|MAR)(?:\.|)
+(?:apr|Apr|APR)(?:\.|)
+// May (only three letters, no dot)
+(?:may|May|MAY)
+(?:jun|Jun|JUN)(?:\.|)
+(?:jul|Jul|JUL)(?:\.|)
+(?:aug|Aug|AUG)(?:\.|)
+(?:sept?|Sept?|SEPT?)(?:\.|)
+(?:oct|Oct|OCT)(?:\.|)
+(?:nov|Nov|NOV)(?:\.|)
+(?:dec|Dec|DEC)(?:\.|)
\ No newline at end of file
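On the optimizer note in reMonthShort: `(?:\.|)` and `\.?` denote the same language — an optional literal dot — so the rewrite only changes the notation the pattern optimizer sees, never which strings match. (The `sept?` alternative keeps the old file's "Sept"/"Sept." forms.) For instance:

    import java.util.regex.Pattern;

    // Both spellings of "optional trailing dot" accept exactly the same inputs.
    public class OptionalDotSketch {
        public static void main(String[] args) {
            Pattern grouped   = Pattern.compile("(?:jan|Jan|JAN)(?:\\.|)");
            Pattern shorthand = Pattern.compile("(?:jan|Jan|JAN)\\.?");
            for (String s : new String[] {"Jan", "Jan.", "JAN."}) {
                System.out.println(s + " -> " + grouped.matcher(s).matches()
                        + " / " + shorthand.matcher(s).matches()); // always identical
            }
        }
    }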
diff --git a/resources/english/repattern/resources_repattern_reNumWord2D.txt b/resources/english/repattern/resources_repattern_reNumWord2D.txt
index 75bfc7d6..f6d07582 100644
--- a/resources/english/repattern/resources_repattern_reNumWord2D.txt
+++ b/resources/english/repattern/resources_repattern_reNumWord2D.txt
@@ -14,83 +14,20 @@
 [Ss]eventeen
 [Ee]ighteen
 [Nn]ineteen
-[Tt]wenty[ -]one
-[Tt]hirty[ -]one
-[Ff]orty[ -]one
-[Ff]ifty[ -]one
-[Ss]ixty[ -]one
-[Ss]eventy[ -]one
-[Ee]ighty[ -]one
-[Nn]inety[ -]one
-[Tt]wenty[ -]two
-[Tt]hirty[ -]two
-[Ff]orty[ -]two
-[Ff]ifty[ -]two
-[Ss]ixty[ -]two
-[Ss]eventy[ -]two
-[Ee]ighty[ -]two
-[Nn]inety[ -]two
-[Tt]wenty[ -]three
-[Tt]hirty[ -]three
-[Ff]orty[ -]three
-[Ff]ifty[ -]three
-[Ss]ixty[ -]three
-[Ss]eventy[ -]three
-[Ee]ighty[ -]three
-[Nn]inety[ -]three
-[Tt]wenty[ -]four
-[Tt]hirty[ -]four
-[Ff]orty[ -]four
-[Ff]ifty[ -]four
-[Ss]ixty[ -]four
-[Ss]eventy[ -]four
-[Ee]ighty[ -]four
-[Nn]inety[ -]four
-[Tt]wenty[ -]five
-[Tt]hirty[ -]five
-[Ff]orty[ -]five
-[Ff]ifty[ -]five
-[Ss]ixty[ -]five
-[Ss]eventy[ -]five
-[Ee]ighty[ -]five
-[Nn]inety[ -]five
-[Tt]wenty[ -]six
-[Tt]hirty[ -]six
-[Ff]orty[ -]six
-[Ff]ifty[ -]six
-[Ss]ixty[ -]six
-[Ss]eventy[ -]six
-[Ee]ighty[ -]six
-[Nn]inety[ -]six
-[Tt]wenty[ -]seven
-[Tt]hirty[ -]seven
-[Ff]orty[ -]seven
-[Ff]ifty[ -]seven
-[Ss]ixty[ -]seven
-[Ss]eventy[ -]seven
-[Ee]ighty[ -]seven
-[Nn]inety[ -]seven
-[Tt]wenty[ -]eight
-[Tt]hirty[ -]eight
-[Ff]orty[ -]eight
-[Ff]ifty[ -]eight
-[Ss]ixty[ -]eight
-[Ss]eventy[ -]eight
-[Ee]ighty[ -]eight
-[Nn]inety[ -]eight
-[Tt]wenty[ -]nine
-[Tt]hirty[ -]nine
-[Ff]orty[ -]nine
-[Ff]ifty[ -]nine
-[Ss]ixty[ -]nine
-[Ss]eventy[ -]nine
-[Ee]ighty[ -]nine
-[Nn]inety[ -]nine
+//
 [Tt]wenty
+[Tt]wenty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
 [Tt]hirty
+[Tt]hirty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
 [Ff]orty
+[Ff]orty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
 [Ff]ifty
+[Ff]ifty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
 [Ss]ixty
+[Ss]ixty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
 [Ss]eventy
+[Ss]eventy[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
 [Ee]ighty
+[Ee]ighty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
 [Nn]inety
+[Nn]inety[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
diff --git a/resources/english/repattern/resources_repattern_reNumWordTeen.txt b/resources/english/repattern/resources_repattern_reNumWordTeen.txt
index 1706b070..e7e1bb49 100644
--- a/resources/english/repattern/resources_repattern_reNumWordTeen.txt
+++ b/resources/english/repattern/resources_repattern_reNumWordTeen.txt
@@ -3,14 +3,14 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for number words 10 to 20.
 // FORMAT: one line is one disjunction of the pattern
-ten
-eleven
-twelve
-thirteen
-fourteen
-fifteen
-sixteen
-seventeen
-eighteen
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
+// 10, 12, 13, 20
+t(?:en|welve|hirteen|wenty)
+// 11, 18
+e(?:leven|ighteen)
+// 14, 15
+f(?:ourteen|ifteen)
+// 16, 17
+s(?:ixteen|eventeen)
+// 19
 nineteen
-twenty
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_rePartOfDay.txt b/resources/english/repattern/resources_repattern_rePartOfDay.txt
index f65c42ea..ad4ac036 100644
--- a/resources/english/repattern/resources_repattern_rePartOfDay.txt
+++ b/resources/english/repattern/resources_repattern_rePartOfDay.txt
@@ -3,12 +3,9 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for parts of days.
 // FORMAT: one line is one disjunction of the pattern
-[Mm]id-afternoon
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
+[Mm](?:orning|id-afternoon|idnight|id-day)
+[Nn](?:oon|ight)
 [Aa]fternoon
-[Nn]oon
-[Mm]idnight
-[Mm]id-day
-[Nn]ight
 [Tt]onight
-[Mm]orning
 [Ee]vening
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_rePartOfYear.txt b/resources/english/repattern/resources_repattern_rePartOfYear.txt
index ee2ff593..1ee98b63 100644
--- a/resources/english/repattern/resources_repattern_rePartOfYear.txt
+++ b/resources/english/repattern/resources_repattern_rePartOfYear.txt
@@ -3,61 +3,13 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for parts of year.
 // FORMAT: one line is one disjunction of the pattern
-[Ff]iscal-first quarter
-[Ff]iscal-first half
-[Ff]iscal-first-quarter
-[Ff]iscal-first-half
-[Ff]iscal-second quarter
-[Ff]iscal-second half
-[Ff]iscal-second-quarter
-[Ff]iscal-second-half
-[Ff]iscal-third quarter
-[Ff]iscal-third half
-[Ff]iscal-third-quarter
-[Ff]iscal-third-half
-[Ff]iscal-fourth quarter
-[Ff]iscal-fourth half
-[Ff]iscal-fourth-quarter
-[Ff]iscal-fourth-half
-[Ff]iscal-forth quarter
-[Ff]iscal-forth half
-[Ff]iscal-forth-quarter
-[Ff]iscal-forth-half
-[Ff]iscal first quarter
-[Ff]iscal first half
-[Ff]iscal first-quarter
-[Ff]iscal first-half
-[Ff]iscal second quarter
-[Ff]iscal second half
-[Ff]iscal second-quarter
-[Ff]iscal second-half
-[Ff]iscal third quarter
-[Ff]iscal third-quarter
-[Ff]iscal fourth quarter
-[Ff]iscal fourth-quarter
-[Ff]iscal forth quarter
-[Ff]iscal forth-quarter
-[Ff]irst quarter
-[Ff]irst half
-[Ff]irst-quarter
-[Ff]irst-half
-[Ss]econd quarter
-[Ss]econd half
-[Ss]econd-quarter
-[Ss]econd-half
-[Tt]hird quarter
-[Tt]hird-quarter
-[Ff]ourth quarter
-[Ff]ourth-quarter
-[Ff]orth quarter
-[Ff]orth-quarter
-[Ll]ast quarter
-[Ll]ast half
-[Ll]ast-quarter
-[Ll]ast-half
-1st quarter
-2nd quarter
-3rd quarter
-4th quarter
-1st half
-2nd half
\ No newline at end of file
+[Ff]iscal[ -](?:first|second|third|fou?rth)[ -](?:half|quarter)
+[Ff]irst[ -](?:half|quarter)
+[Ss]econd[ -](?:half|quarter)
+[Tt]hird[ -]quarter
+[Ff]ou?rth[ -]quarter
+[Ll]ast[ -](?:half|quarter)
+1st[ -](?:half|quarter)
+2nd[ -](?:half|quarter)
+3rd[ -]quarter
+4th[ -]quarter
diff --git a/resources/english/repattern/resources_repattern_rePartWords.txt b/resources/english/repattern/resources_repattern_rePartWords.txt
index 29253adf..bc34d850 100644
--- a/resources/english/repattern/resources_repattern_rePartWords.txt
+++ b/resources/english/repattern/resources_repattern_rePartWords.txt
@@ -3,15 +3,12 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for "part" words.
 // FORMAT: one line is one disjunction of the pattern
-[Tt]he middle of
-[Tt]he end of
-[Tt]he beginning of
-[Tt]he start of
-[Ll]ate
-[Ll]ater
-[Ee]arly
-[Ee]arlier
-[Mm]id-
-[Mm]id
-[Ff]iscal-
-[Ff]iscal
\ No newline at end of file
+[Tt]he (?:start|beginning|middle|end) of
+[Ss]tart of
+[Bb]eginning of
+[Mm]iddle of
+[Ee]nd of
+[Dd]awn of
+[Ll]ater?
+[Ee]arl(?:y|ier)
+[Mm]id-?
+[Ff]iscal-?
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reSeason.txt b/resources/english/repattern/resources_repattern_reSeason.txt
index 5f079816..d65181f1 100644
--- a/resources/english/repattern/resources_repattern_reSeason.txt
+++ b/resources/english/repattern/resources_repattern_reSeason.txt
@@ -3,8 +3,7 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for seasons.
 // FORMAT: one line is one disjunction of the pattern
-[Ss]pring
-[Ss]ummer
+[Ss](?:pring|ummer)
 [Ff]all
 [Aa]utumn
 [Ww]inter
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reTimeHour.txt b/resources/english/repattern/resources_repattern_reTimeHour.txt
index f9795cef..d6c1d19c 100644
--- a/resources/english/repattern/resources_repattern_reTimeHour.txt
+++ b/resources/english/repattern/resources_repattern_reTimeHour.txt
@@ -3,19 +3,6 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for time hours.
 // FORMAT: one line is one disjunction of the pattern
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20
-21
-22
-23
-24
+1[0-9]
+2[0-4]
 0?[0-9]
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reTimeMinute.txt b/resources/english/repattern/resources_repattern_reTimeMinute.txt
index 6e560f7a..40572855 100644
--- a/resources/english/repattern/resources_repattern_reTimeMinute.txt
+++ b/resources/english/repattern/resources_repattern_reTimeMinute.txt
@@ -3,4 +3,4 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for time minutes.
 // FORMAT: one line is one disjunction of the pattern
-[0|1|2|3|4|5][0-9]
\ No newline at end of file
+[0-5][0-9]
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reTimezone.txt b/resources/english/repattern/resources_repattern_reTimezone.txt
index 6864e1a7..201eb1a6 100644
--- a/resources/english/repattern/resources_repattern_reTimezone.txt
+++ b/resources/english/repattern/resources_repattern_reTimezone.txt
@@ -3,6 +3,12 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for time zones.
 // FORMAT: one line is one disjunction of the pattern
-EST
-EDT
+// 2016-01-13 Added more (important) time zones -- Erich
 GMT
+UTC
+// Important US time zones (Omitted Alaska)
+[EPCMH][SD]T
+// Central Europe
+CE[SD]?T(?: DST)?
+// Japan
+JST
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reUnit.txt b/resources/english/repattern/resources_repattern_reUnit.txt
index d9049720..1df562c9 100644
--- a/resources/english/repattern/resources_repattern_reUnit.txt
+++ b/resources/english/repattern/resources_repattern_reUnit.txt
@@ -3,16 +3,13 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for unit words.
 // FORMAT: one line is one disjunction of the pattern
-[Tt]rading days?
 [Dd]ays?
-[Ww]eek-ends?
-[Ww]eekends?
-[Ww]eeks?
+[Ww]eek(?:-?end)?s?
 [Mm]onths?
 [Qq]uarters?
 [Ff]iscal years?
 [Yy]ears?
 [Dd]ecades?
-[Cc]entury
-[Cc]enturies
+[Cc]entur(?:y|ies)
+[Tt]rading days?
diff --git a/resources/english/repattern/resources_repattern_reUnitFine.txt b/resources/english/repattern/resources_repattern_reUnitFine.txt
new file mode 100644
index 00000000..5b843762
--- /dev/null
+++ b/resources/english/repattern/resources_repattern_reUnitFine.txt
@@ -0,0 +1,17 @@
+// author: Jannik Strötgen
+// email: stroetgen@uni-hd.de
+// date: 2011-06-10
+// This file contains regular expression patterns for unit words.
+// FORMAT: one line is one disjunction of the pattern
+[Hh]ours?
+[Mm]inutes?
+[Ss]econds?
+[Dd]ays?
+[Ww]eek(?:-?end)?s?
+[Mm]onths?
+[Qq]uarters?
+[Ff]iscal years?
+[Yy]ears?
+[Dd]ecades?
+[Cc]entur(?:y|ies)
+[Tt]rading days?
diff --git a/resources/english/repattern/resources_repattern_reWeekday.txt b/resources/english/repattern/resources_repattern_reWeekday.txt
index 5218f98d..069cba47 100644
--- a/resources/english/repattern/resources_repattern_reWeekday.txt
+++ b/resources/english/repattern/resources_repattern_reWeekday.txt
@@ -3,10 +3,12 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for weekdays.
 // FORMAT: one line is one disjunction of the pattern
+// 2016-01-10 Allow some spelling mistakes of Wednesday -- Erich
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
 [Mm]onday
 [Tt]uesday
-[Ww]ednesday
+// Allow spelling errors:
+[Ww]e[dn][dn]e?sday
 [Tt]hursday
 [Ff]riday
-[Ss]aturday
-[Ss]unday
\ No newline at end of file
+[Ss](?:aturday|unday)
diff --git a/resources/english/repattern/resources_repattern_reYear2Digit.txt b/resources/english/repattern/resources_repattern_reYear2Digit.txt
index d0ed65e6..4bb18e18 100644
--- a/resources/english/repattern/resources_repattern_reYear2Digit.txt
+++ b/resources/english/repattern/resources_repattern_reYear2Digit.txt
@@ -3,4 +3,4 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for year numbers (2 digit).
 // FORMAT: one line is one disjunction of the pattern
-\d\d
\ No newline at end of file
+[0-9][0-9]
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reYear4Digit.txt b/resources/english/repattern/resources_repattern_reYear4Digit.txt
index 99ae2548..30615d13 100644
--- a/resources/english/repattern/resources_repattern_reYear4Digit.txt
+++ b/resources/english/repattern/resources_repattern_reYear4Digit.txt
@@ -3,4 +3,8 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for year numbers (4 digits).
 // FORMAT: one line is one disjunction of the pattern
-[12]\d\d\d
\ No newline at end of file
+1[0-9][0-9][0-9]
+// Be conservative with future dates beyond 2100:
+20[0-9][0-9]
+2100
+2200
diff --git a/resources/english/repattern/resources_repattern_reYearBC.txt b/resources/english/repattern/resources_repattern_reYearBC.txt
index 68a4665c..cdfd1da9 100644
--- a/resources/english/repattern/resources_repattern_reYearBC.txt
+++ b/resources/english/repattern/resources_repattern_reYearBC.txt
@@ -3,7 +3,4 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for year numbers (4 digits).
 // FORMAT: one line is one disjunction of the pattern
-\d\d\d\d
-\d\d\d
-\d\d
-\d
+[0-9][0-9]?[0-9]?[0-9]?
diff --git a/resources/english/repattern/resources_repattern_reYearPrefix.txt b/resources/english/repattern/resources_repattern_reYearPrefix.txt
index 1964fee0..89ff7145 100644
--- a/resources/english/repattern/resources_repattern_reYearPrefix.txt
+++ b/resources/english/repattern/resources_repattern_reYearPrefix.txt
@@ -3,11 +3,11 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for year numbers (4 digits).
 // FORMAT: one line is one disjunction of the pattern
-BC
-B[\.]C[\.]
-B[\.]C
+A\.D\.
 AD
-A[\.]D[\.]
-A[\.]D
+B\.C\.E\.
+B\.C\.
 BCE
+BC
+C\.E\.
 CE
\ No newline at end of file
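Ordering matters in these disjunction files because Java alternation is leftmost-first, not longest-match: the longer alternatives B\.C\.E\. and BCE must precede their prefixes B\.C\. and BC, or the shorter pattern wins and truncates the match. The hazard in miniature (illustrative class name):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    // At the same start position, the first listed alternative that matches wins,
    // even when a later alternative would match more text.
    public class AlternationOrderSketch {
        public static void main(String[] args) {
            Matcher shortFirst = Pattern.compile("BC|BCE").matcher("190 BCE");
            Matcher longFirst  = Pattern.compile("BCE|BC").matcher("190 BCE");
            shortFirst.find();
            longFirst.find();
            System.out.println(shortFirst.group()); // "BC"  -- "BCE" is shadowed
            System.out.println(longFirst.group());  // "BCE" -- longest-first is safe
        }
    }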
historic_3d-BCADhint: March 29, 190 (3-digit year) -// EXAMPLE historic_3e-BCADhint: March 29, 90 (2-digit year) -RULENAME="date_historic_3a-BCADhint",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)[\s]?,? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(8))%normYearBC(group(7))-%normMonth(group(1))-%normDay(group(4))" -RULENAME="date_historic_3b-BCADhint",EXTRACTION="%reDayNumber (%reMonthLong|%reMonthShort)([\s]?,)? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))-%normMonth(group(2))-%normDay(group(1))" -RULENAME="date_historic_3c-BCADhint",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber) (of) (%reMonthLong|%reMonthShort) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(10))%normYearBC(group(9))-%normMonth(group(6))-%normDay(group(1))" -RULENAME="date_historic_3d",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)[\s]?,? ([\d\d\d])",NORM_VALUE="%normYearBC(group(7))-%normMonth(group(1))-%normDay(group(4))" -RULENAME="date_historic_3e",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)(,)? ([\d\d])",NORM_VALUE="UNDEF-centurygroup(8)-%normMonth(group(1))-%normDay(group(4))" - -// historic dates; season granularity -// EXAMPLE historic_4a-BCADhint: summer of 190 BC (1- to 4-digit year) -RULENAME="date_historic_4a-BCADhint",EXTRACTION="(%reApproximate )?(the )?%reSeason( of | )%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))-%normSeason(group(4))",NORM_MOD="%normApprox4Dates(group(2))" - -// historic dates; century granularity -// EXAMPLE date_historic_5a-BCADhint: the 2nd century BC -// EXAMPLE date_historic_5b-BCADhint: beginning of the 2nd century BC -// EXAMPLE date_historic_5c-BCADhint: 2nd or 3rd century BC (find "2nd century BC") -// EXAMPLE date_historic_5d-BCADhint: beginning of the 2nd or 3rd century BC (find "beginning 2nd century BC") -RULENAME="date_historic_5a-BCADhint",EXTRACTION="([Tt]he )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)( %reYearPrefix)",NORM_VALUE="%normYearPrefix(group(7))%normDay(%SUM%(%normDay(group(2)),-1))" -RULENAME="date_historic_5b-BCADhint",EXTRACTION="%rePartWords( the)? (%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)( %reYearPrefix)",NORM_VALUE="%normYearPrefix(group(8))%normDay(%SUM%(%normDay(group(3)),-1))",NORM_MOD="%normPartWords(group(1))" -RULENAME="date_historic_5c-BCADhint",EXTRACTION="(([Tt]he )?(%reDayNumberTh|%reDayWordTh))%reAndOrTo(the )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)( %reYearPrefix)",NORM_VALUE="%normYearPrefix(group(13))%normDay(%SUM%(%normDay(group(3)),-1))",OFFSET="group(1)-group(1)" -RULENAME="date_historic_5d-BCADhint",EXTRACTION="(%rePartWords( the)? 
(%reDayNumberTh|%reDayWordTh))%reAndOrTo(the )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)( %reYearPrefix)",NORM_VALUE="%normYearPrefix(group(14))%normDay(%SUM%(%normDay(group(4)),-1))",OFFSET="group(1)-group(1)",NORM_MOD="%normPartWords(group(2))" - -// historic dates; decade granularity -// EXAMPLE date_historic_6a-BCADhint: 1990s BC -// EXAMPLE date_historic_6b-BCADhint: 190s BC -// EXAMPLE date_historic_6c-BCADhint: 90s BC -RULENAME="date_historic_6a-BCADhint",EXTRACTION="(%rePartWords )?([Tt]he )?(\d\d\d0)[']?[s] %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%SUBSTRING%(group(4),0,3)",NORM_MOD="%normPartWords(group(2))" -RULENAME="date_historic_6b-BCADhint",EXTRACTION="(%rePartWords )?([Tt]he )?(\d\d0)[']?[s] %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))0%SUBSTRING%(group(4),0,2)",NORM_MOD="%normPartWords(group(2))" -RULENAME="date_historic_6c-BCADhint",EXTRACTION="(%rePartWords )?([Tt]he )?(\d0)[']?[s] %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))00%SUBSTRING%(group(4),0,1)",NORM_MOD="%normPartWords(group(2))" - -// historic dates; year granularity (no AD / BC hints) -// EXAMPLE date_historic_7a: (in) 190 (3-digit year) -// EXAMPLE date_historic_7b: (in) 190(,) (3-digit year) -// EXAMPLE date_historic_7c: (newline)190(newline) (2- to 4-digit year) -// EXAMPLE date_historic_7d: year of 90 (2-digit year) -// EXAMPLE date_historic_7e: year of 190 (3-digit year) -RULENAME="date_historic_7a",EXTRACTION="(\b[Ii]n) (\d\d\d)",NORM_VALUE="%normYearBC(group(2))",OFFSET="group(2)-group(2)" -RULENAME="date_historic_7b",EXTRACTION="(\b[Ii]n) (\d\d\d)(,)",NORM_VALUE="%normYearBC(group(2))",OFFSET="group(2)-group(2)" -RULENAME="date_historic_7c",EXTRACTION="\A(\d\d[\d]?[\d]?)\Z",NORM_VALUE="%normYearBC(group(1))" -RULENAME="date_historic_7d",EXTRACTION="([Tt]he )?(year) (of) (\d\d)",NORM_VALUE="UNDEF-centurygroup(4)" -RULENAME="date_historic_7e",EXTRACTION="([Tt]he )?(year) (of) (\d\d\d)",NORM_VALUE="%normYearBC(group(4))" - -// historic dates; 2-digit year granularity (no AD / BC hints) -// EXAMPLE date_historic_8a: (in) 90(,) (2-digit year) -// EXAMPLE date_historic_8b: (in) 90 (2-digit year) -RULENAME="date_historic_8a",EXTRACTION="(\b[Ii]n) (\d\d)(,)",NORM_VALUE="UNDEF-centurygroup(2)",OFFSET="group(2)-group(2)" -RULENAME="date_historic_8b",EXTRACTION="(\b[Ii]n) (\d\d)",NORM_VALUE="UNDEF-centurygroup(2)",OFFSET="group(2)-group(2)" - -// historic dates; negative rules -// EXAMPLE date_historic_0a: in 90 cases (2- to 4-digit year) -// EXAMPLE date_historic_0b: in 90 nice cases (2- to 4-digit year) -// EXAMPLE date_historic_0c: in 90 nice law cases (2- to 4-digit year) -// EXAMPLE date_historic_0d: in 90 percent (2- to 4-digit year) -RULENAME="date_historic_0a_negative",EXTRACTION="(\b[Ii]n) (%reYearBC )([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(4):NNS:",OFFSET="group(2)-group(2)" -RULENAME="date_historic_0b_negative",EXTRACTION="(\b[Ii]n) (%reYearBC )([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(4):JJ:group(5):NNS:",OFFSET="group(2)-group(2)" -RULENAME="date_historic_0c_negative",EXTRACTION="(\b[Ii]n) (%reYearBC )([\S]+) ([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(4):JJ:group(5):NN:group(6):NNS:",OFFSET="group(2)-group(2)" -RULENAME="date_historic_0d_negative",EXTRACTION="(\b[Ii]n) (%reYearBC )(kilometers?|miles?|foot|feet|dollars?|percents?|millions?|mi|ft|km|%|\$)",NORM_VALUE="REMOVE" - //////////////////// // POSITIVE RULES // //////////////////// @@ -122,9 +35,9 @@ 
RULENAME="date_r0h",EXTRACTION="%reDayNumber[\.]%reMonthNumber[\.]%reYear4Digit" // EXAMPLE r1a_3: Feb. 25, 2009, Monday // EXAMPLE r1b_1: 25 February 2009 // EXAMPLE r1c_1: 25 of February 2009 -RULENAME="date_r1a",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)[\s]?,? %reYear4Digit(, %reWeekday)?",NORM_VALUE="group(7)-%normMonth(group(1))-%normDay(group(4))" -RULENAME="date_r1b",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNumber) (%reMonthLong|%reMonthShort)([\s]?,)? %reYear4Digit",NORM_VALUE="group(10)-%normMonth(group(6))-%normDay(group(2))" -RULENAME="date_r1c",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNumber) (of) (%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(10)-%normMonth(group(7))-%normDay(group(2))" +RULENAME="date_r1a",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)\s?,? %reYear4Digit(?:, %reWeekday)?",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))" +RULENAME="date_r1b",EXTRACTION="(?:[Tt]he )?%(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort)\s?,? %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))" +RULENAME="date_r1c",EXTRACTION="(?:[Tt]he )?%(reDayNumber|reDayNumberTh|reDayWordTh) of %(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))" RULENAME="date_r1d",EXTRACTION="%reDayNumber[‐–-]%reMonthShort[‐–-]%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))-%normDay(group(1))" // date_r2 @@ -138,15 +51,15 @@ RULENAME="date_r1d",EXTRACTION="%reDayNumber[‐–-]%reMonthShort[‐–-]%reYe // EXAMPLE r2c_3: 19th of November // EXAMPLE r2d_1: 3 to 6 May (find May 3) // EXAMPLE r2e_2: 3 to 6 May 2004 (find May 3, 2004) -RULENAME="date_r2a",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(4))" -RULENAME="date_r2b",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)%reAndOrTo(%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(9))",OFFSET="group(9)-group(9)" -RULENAME="date_r2c",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNumber)( of | )(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(7))-%normDay(group(2))" -RULENAME="date_r2d",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber[\.]?)%reAndOrTo(%reDayWordTh|%reDayNumberTh|%reDayNumber[.]?) (%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(10))-%normDay(group(1))",OFFSET="group(1)-group(1)" -RULENAME="date_r2e",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber[\.]?)%reAndOrTo(%reDayWordTh|%reDayNumberTh|%reDayNumber[.]?) 
(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(13)-%normMonth(group(10))-%normDay(group(1))",OFFSET="group(1)-group(1)" +RULENAME="date_r2a",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(2))" +RULENAME="date_r2b",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(4))",OFFSET="group(4)-group(4)" +RULENAME="date_r2c",EXTRACTION="(?:[Tt]he )?%(reDayNumber|reDayNumberTh|reDayWordTh) (?:of )?%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(1))" +RULENAME="date_r2d",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh) %reMonthShort",NORM_VALUE="UNDEF-year-%normMonth(group(4))-%normDay(group(1))",OFFSET="group(1)-group(1)" +RULENAME="date_r2e",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh) %reMonthShort %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(4))-%normDay(group(1))",OFFSET="group(1)-group(1)" // EXAMPLE r2a2_1: January 19th of that year -// EXAMPLE r2b2_1: 19th of January of the same year -RULENAME="date_r2a2",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber) of (that|the same) year",NORM_VALUE="UNDEF-REF-year-PLUS-0-%normMonth(group(1))-%normDay(group(4))" -RULENAME="date_r2c2",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNumber)( of | )(%reMonthLong|%reMonthShort) of (that|the same) year",NORM_VALUE="UNDEF-REF-year-PLUS-0-%normMonth(group(7))-%normDay(group(2))" +// EXAMPLE r2c2_1: 19th of January of the same year +RULENAME="date_r2a2",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh) of (?:that|the same) year",NORM_VALUE="UNDEF-REF-year-PLUS-0-%normMonth(group(1))-%normDay(group(2))" +RULENAME="date_r2c2",EXTRACTION="(?:[Tt]he )?%(reDayNumber|reDayNumberTh|reDayWordTh) (?:of )?%(reMonthLong|reMonthShort) of (?:that|the same) year",NORM_VALUE="UNDEF-REF-year-PLUS-0-%normMonth(group(2))-%normDay(group(1))" // date_r3 @@ -154,14 +67,14 @@ RULENAME="date_r2c2",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNu // EXAMPLE r3a_2: Monday, Oct 12 // EXAMPLE r3b_1: Friday October 13 2009 // EXAMPLE r3b_2: Monday, October 12th 2009 -RULENAME="date_r3a",EXTRACTION="%reWeekday[,]? (%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(5))" -RULENAME="date_r3b",EXTRACTION="%reWeekday[,]? (%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)[,]? %reYear4Digit",NORM_VALUE="group(9)-%normMonth(group(2))-%normDay(group(5))" +RULENAME="date_r3a",EXTRACTION="%reWeekday,? %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(3))" +RULENAME="date_r3b",EXTRACTION="%reWeekday,? %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh),? %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(2))-%normDay(group(3))" // date_r4 // EXAMPLE r4a_1: September 14 and 18, 2010 (find September 14 2010) // EXAMPLE r4b_1: September 14 and 18, 2010 (find September 18 2010) -RULENAME="date_r4a",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)%reAndOrTo(%reDayNumberTh|%reDayNumber)[,]? 
%reYear4Digit",NORM_VALUE="group(11)-%normMonth(group(1))-%normDay(group(4))",OFFSET="group(0)-group(4)" -RULENAME="date_r4b",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)%reAndOrTo(%reDayNumberTh|%reDayNumber)[,]? %reYear4Digit",NORM_VALUE="group(11)-%normMonth(group(1))-%normDay(group(8))",OFFSET="group(8)-group(11)" +RULENAME="date_r4a",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)%reAndOrTo%(reDayNumber|reDayNumberTh),? %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(1))-%normDay(group(2))",OFFSET="group(0)-group(2)" +RULENAME="date_r4b",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)%reAndOrTo%(reDayNumber|reDayNumberTh),? %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(1))-%normDay(group(4))",OFFSET="group(4)-group(5)" // date_r5 // EXAMPLE r5a_1: tomorrow @@ -178,7 +91,7 @@ RULENAME="date_r5d",EXTRACTION="%rePartWords %reWeekday",NORM_VALUE="UNDEF-day-% ////////////////////// // date_r6 // EXAMPLE r6a_1: the weekend -RULENAME="date_r61",EXTRACTION="(the|that) weekend",NORM_VALUE="UNDEF-last-week-WE" +RULENAME="date_r61",EXTRACTION="(?:the|that) weekend",NORM_VALUE="UNDEF-last-week-WE" /////////////////////// // MONTH GRANULARITY // @@ -188,16 +101,17 @@ RULENAME="date_r61",EXTRACTION="(the|that) weekend",NORM_VALUE="UNDEF-last-week- // EXAMPLE r7a_2: Nov. 2001 // EXAMPLE r7a_3: February of 1999 // EXAMPLE r7b_1: May and June 2011 (find May 2001) -RULENAME="date_r7a",EXTRACTION="(%reMonthLong|%reMonthShort)( of | )%reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(1))" -RULENAME="date_r7b",EXTRACTION="(%reMonthLong|%reMonthShort)( of | )%reNumWordTeen( |-)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(5))%normDurationNumber(group(7))-%normMonth(group(1))" -RULENAME="date_r7c",EXTRACTION="(%reMonthLong|%reMonthShort) (and|or|to|until) (%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(8)-%normMonth(group(1))",OFFSET="group(1)-group(1)" +RULENAME="date_r7a",EXTRACTION="%(reMonthLong|reMonthShort) (?:of )?%reYear4Digit",NORM_VALUE="group(2)-%normMonth(group(1))" +RULENAME="date_r7b",EXTRACTION="%(reMonthLong|reMonthShort) (?:of )?%reNumWordTeen[ -]%reNumWord2D",NORM_VALUE="%normDurationNumber(group(2))%normDurationNumber(group(3))-%normMonth(group(1))" +RULENAME="date_r7c",EXTRACTION="%(reMonthLong|reMonthShort)%reAndOrTo%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(1))",OFFSET="group(1)-group(1)" +RULENAME="date_r7d",EXTRACTION="%(reMonthLong|reMonthShort)%reAndOrTo%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(3))",OFFSET="group(3)-group(4)" // date_r8 // EXAMPLE r8a_1: November next year // EXAMPLE r8a_2: May last year // EXAMPLE -RULENAME="date_r8a",EXTRACTION="%reMonthLong (the )?%reThisNextLast year",NORM_VALUE="UNDEF-%normThisNextLast(group(3))-year-%normMonth(group(1))" -RULENAME="date_r8b",EXTRACTION="%reMonthLong of (that|the same) year",NORM_VALUE="UNDEF-REF-year-MINUS-0-%normMonth(group(1))" +RULENAME="date_r8a",EXTRACTION="%reMonthLong (?:the )?%reThisNextLast year",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normMonth(group(1))" +RULENAME="date_r8b",EXTRACTION="%reMonthLong of (?:that|the same) year",NORM_VALUE="UNDEF-REF-year-MINUS-0-%normMonth(group(1))" //////////////////////// // SEASON GRANULARITY // @@ -207,10 +121,10 @@ RULENAME="date_r8b",EXTRACTION="%reMonthLong of (that|the same) year",NORM_VALUE // EXAMPLE r9b_1: winter 2001 // EXAMPLE r9b_1: winter of 2001 // EXAMPLE r9c_1: summer of 69 
-RULENAME="date_r9a",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason",NORM_VALUE="UNDEF-year-%normSeason(group(3))",NORM_MOD="%normPartWords(group(2))" -RULENAME="date_r9b",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason( of | )%reYear4Digit",NORM_VALUE="group(5)-%normSeason(group(3))",NORM_MOD="%normPartWords(group(2))" -RULENAME="date_r9c",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason( of | )%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(5)-%normSeason(group(3))",NORM_MOD="%normPartWords(group(2))" -RULENAME="date_r9d",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason( of | )%reNumWordTeen( |-)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(5))%normDurationNumber(group(7))-%normSeason(group(3))",NORM_MOD="%normPartWords(group(2))" +RULENAME="date_r9a",EXTRACTION="(?:%rePartWords |[Tt]he |)%reSeason",NORM_VALUE="UNDEF-year-%normSeason(group(2))",NORM_MOD="%normPartWords(group(1))" +RULENAME="date_r9b",EXTRACTION="(?:%rePartWords |[Tt]he |)%reSeason (?:of )?%reYear4Digit",NORM_VALUE="group(3)-%normSeason(group(2))",NORM_MOD="%normPartWords(group(1))" +RULENAME="date_r9c",EXTRACTION="(?:%rePartWords |[Tt]he |)%reSeason (?:of )?'?%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normSeason(group(2))",NORM_MOD="%normPartWords(group(1))" +RULENAME="date_r9d",EXTRACTION="(?:%rePartWords |[Tt]he |)%reSeason (?:of )?%reNumWordTeen[ -]%reNumWord2D",NORM_VALUE="%normDurationNumber(group(3))%normDurationNumber(group(4))-%normSeason(group(2))",NORM_MOD="%normPartWords(group(1))" ////////////////////////////// @@ -220,9 +134,10 @@ RULENAME="date_r9d",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason( of | )%reNumW // EXAMPLE r10a_1: the third quarter of 2001 // EXAMPLE r10b_1: the second half // EXAMPLE r10c_1: the 2001 third quarter -RULENAME="date_r10a",EXTRACTION="([Tt]he )?%rePartOfYear( of | )%reYear4Digit",NORM_VALUE="group(4)-%normPartOfYear(group(2))" -RULENAME="date_r10b",EXTRACTION="([Tt]he )?%rePartOfYear",NORM_VALUE="UNDEF-year-%normPartOfYear(group(2))" -RULENAME="date_r10c",EXTRACTION="([Tt]he )?%reYear4Digit %rePartOfYear",NORM_VALUE="group(2)-%normPartOfYear(group(3))" +RULENAME="date_r10a",EXTRACTION="(?:[Tt]he )?%rePartOfYear (?:of )?%reYear4Digit",NORM_VALUE="group(2)-%normPartOfYear(group(1))" +// TODO: causes many false positives from sports, but helps with finance +RULENAME="date_r10b",EXTRACTION="(?:[Tt]he )?%rePartOfYear",NORM_VALUE="UNDEF-year-%normPartOfYear(group(1))" +RULENAME="date_r10c",EXTRACTION="(?:[Tt]he )?%reYear4Digit %rePartOfYear",NORM_VALUE="group(1)-%normPartOfYear(group(2))" // date_r11 // EXAMPLE r11a_1: this year's third quarter @@ -230,8 +145,8 @@ RULENAME="date_r10c",EXTRACTION="([Tt]he )?%reYear4Digit %rePartOfYear",NORM_VAL // EXAMPLE r11b_1: the year-earlier first half // EXAMPLE r11c_1: the second half of this year RULENAME="date_r11a",EXTRACTION="%reThisNextLast year's %rePartOfYear",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-year-%normPartOfYear(group(2))" -RULENAME="date_r11b",EXTRACTION="[Tt]he (year-earlier|year-ago) %rePartOfYear",NORM_VALUE="UNDEF-last-year-%normPartOfYear(group(2))" -RULENAME="date_r11c",EXTRACTION="([Tt]he )?%rePartOfYear of %reThisNextLast year",NORM_VALUE="UNDEF-%normThisNextLast(group(3))-year-%normPartOfYear(group(2))" +RULENAME="date_r11b",EXTRACTION="[Tt]he (?:year-earlier|year-ago) %rePartOfYear",NORM_VALUE="UNDEF-last-year-%normPartOfYear(group(1))" +RULENAME="date_r11c",EXTRACTION="(?:[Tt]he )?%rePartOfYear of %reThisNextLast year",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normPartOfYear(group(1))" 
 
 //////////////////////
 // YEAR GRANULARITY //
@@ -241,12 +156,14 @@ RULENAME="date_r11c",EXTRACTION="([Tt]he )?%rePartOfYear of %reThisNextLast year
 // EXAMPLE r12b_1: 1850-58 (find: 1858)
 // EXAMPLE r12c_1: nineteen ninety-one
 // EXAMPLE r12d_1: two-thousand ten
-RULENAME="date_r12a",EXTRACTION="(the year )?%reYear4Digit",NORM_VALUE="group(2)"
-//RULENAME="date_r12b",EXTRACTION="%reYear4Digit(-|–| and )%reYear2Digit",NORM_VALUE="%SUBSTRING%(group(1),0,2)group(3)",OFFSET="group(3)-group(3)"
+RULENAME="date_r12a",EXTRACTION="(?:the year )?%reYear4Digit",NORM_VALUE="group(1)"
+//RULENAME="date_r12b",EXTRACTION="%reYear4Digit(?:-|–| and )%reYear2Digit",NORM_VALUE="%SUBSTRING%(group(1),0,2)group(2)",OFFSET="group(2)-group(2)"
 RULENAME="date_r12b",EXTRACTION="%reYear4Digit%reAndOrTo%reYear2Digit",NORM_VALUE="%SUBSTRING%(group(1),0,2)group(3)",OFFSET="group(3)-group(3)"
-RULENAME="date_r12c",EXTRACTION="%reNumWordTeen( |-)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(1))%normDurationNumber(group(3))"
-RULENAME="date_r12d",EXTRACTION="two( |-)thousand( and)? (%reNumWord2D|%reNumWord1D)",NORM_VALUE="20%normDurationNumber(group(3))"
-RULENAME="date_r12e",EXTRACTION="[Tt]he year two( |-)thousand",NORM_VALUE="2000"
+RULENAME="date_r12c",EXTRACTION="%reNumWordTeen[ -]%reNumWord2D",NORM_VALUE="%normDurationNumber(group(1))%normDurationNumber(group(2))"
+RULENAME="date_r12d",EXTRACTION="two[ -]thousand(?: and)? %(reNumWord1D|reNumWord2D)",NORM_VALUE="20%normDurationNumber(group(1))"
+RULENAME="date_r12e",EXTRACTION="[Tt]he year two[ -]thousand",NORM_VALUE="2000"
+RULENAME="date_r12f1",EXTRACTION="%reYear4Digit%reAndOrTo%reYear4Digit",NORM_VALUE="group(1)",OFFSET="group(1)-group(1)"
+RULENAME="date_r12f2",EXTRACTION="%reYear4Digit%reAndOrTo%reYear4Digit",NORM_VALUE="group(3)",OFFSET="group(3)-group(3)"
 
 ////////////////////////
 // DECADE GRANULARITY //
@@ -256,30 +173,30 @@ RULENAME="date_r12e",EXTRACTION="[Tt]he year two( |-)thousand",NORM_VALUE="2000"
 // EXAMPLE r13b_1: the 90s
 // EXAMPLE r13c_1: the seventies
 // EXAMPLE r13d_1: the nineteen seventies
-RULENAME="date_r13a",EXTRACTION="([Tt]he )?(\d\d\d0)[']?[s]",NORM_VALUE="%SUBSTRING%(group(2),0,3)"
-RULENAME="date_r13b",EXTRACTION="([Tt]he )?[']?(\d0)[']?[s]",NORM_VALUE="19%SUBSTRING%(group(2),0,1)"
-RULENAME="date_r13c",EXTRACTION="([Tt]he )?%reDecadeWord",NORM_VALUE="19%normDecadeWord(group(2))"
-RULENAME="date_r13d",EXTRACTION="([Tt]he )?%reNumWordTeen %reDecadeWord",NORM_VALUE="%normDurationNumber(group(2))%normDecadeWord(group(3))"
+RULENAME="date_r13a",EXTRACTION="(?:[Tt]he )?(\d\d\d0)'?s",NORM_VALUE="%SUBSTRING%(group(1),0,3)"
+RULENAME="date_r13b",EXTRACTION="(?:[Tt]he )?'?(\d0)'?s",NORM_VALUE="19%SUBSTRING%(group(1),0,1)"
+RULENAME="date_r13c",EXTRACTION="(?:[Tt]he )?%reDecadeWord",NORM_VALUE="19%normDecadeWord(group(1))"
+RULENAME="date_r13d",EXTRACTION="(?:[Tt]he )?%reNumWordTeen %reDecadeWord",NORM_VALUE="%normDurationNumber(group(1))%normDecadeWord(group(2))"
 
 // date_r14
 // EXAMPLE r14a_1: the early 1990s
 // EXAMPLE r14b_1: the mid-90s
 // EXAMPLE r14c_1: the late seventies
 // EXAMPLE r14d_1: the early nineteen seventies
-RULENAME="date_r14a",EXTRACTION="([Tt]he )?%rePartWords[\s]?(\d\d\d0)[']?[s]",NORM_VALUE="%SUBSTRING%(group(3),0,3)",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r14b",EXTRACTION="([Tt]he )?%rePartWords[\s]?[']?%reYear2Digit[']?[s]",NORM_VALUE="19%SUBSTRING%(group(3),0,1)",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r14c",EXTRACTION="([Tt]he )?%rePartWords[\s]?%reDecadeWord",NORM_VALUE="19%normDecadeWord(group(3))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r14d",EXTRACTION="([Tt]he )?%rePartWords[\s]?%reNumWordTeen %reDecadeWord",NORM_VALUE="%normDurationNumber(group(3))%normDecadeWord(group(4))",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r14a",EXTRACTION="(?:[Tt]he )?%rePartWords\s?(\d\d\d0)'?s",NORM_VALUE="%SUBSTRING%(group(2),0,3)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r14b",EXTRACTION="(?:[Tt]he )?%rePartWords\s?'?%reYear2Digit'?s",NORM_VALUE="19%SUBSTRING%(group(2),0,1)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r14c",EXTRACTION="(?:[Tt]he )?%rePartWords\s?%reDecadeWord",NORM_VALUE="19%normDecadeWord(group(2))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r14d",EXTRACTION="(?:[Tt]he )?%rePartWords\s?%reNumWordTeen %reDecadeWord",NORM_VALUE="%normDurationNumber(group(2))%normDecadeWord(group(3))",NORM_MOD="%normPartWords(group(1))"
 
 /////////////////////////
 // CENTURY GRANULARITY //
 /////////////////////////
 //// EXAMPLE r15a_1: the 19th century
 //// EXAMPLE r15a_2: the seventh century
-RULENAME="date_r15a",EXTRACTION="([Tt]he )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))"
-RULENAME="date_r15b",EXTRACTION="%rePartWords( the)? (%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(3)),-1))",NORM_MOD="%normPartWords(group(1))"
-RULENAME="date_r15c",EXTRACTION="(([Tt]he )?(%reDayNumberTh|%reDayWordTh))%reAndOrTo(the )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(3)),-1))",OFFSET="group(1)-group(1)"
-RULENAME="date_r15d",EXTRACTION="(%rePartWords( the)? (%reDayNumberTh|%reDayWordTh))%reAndOrTo(the )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)?",NORM_VALUE="%normDay(%SUM%(%normDay(group(4)),-1))",OFFSET="group(1)-group(1)",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r15a",EXTRACTION="(?:[Tt]he )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(1)),-1))"
+RULENAME="date_r15b",EXTRACTION="%rePartWords(?: the)? %(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r15c",EXTRACTION="(?:[Tt]he )?%(reDayNumberTh|reDayWordTh)%reAndOrTo(?:the )?(?:%rePartWords )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(1)),-1))",OFFSET="group(0)-group(1)"
+RULENAME="date_r15d",EXTRACTION="%rePartWords(?: the)? %(reDayNumberTh|reDayWordTh)%reAndOrTo(?:the )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies)?",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))",OFFSET="group(0)-group(2)",NORM_MOD="%normPartWords(group(1))"
 
 ///////////////////////////////////
 // GRANULARITY INDEPENDENT RULES //
@@ -289,12 +206,12 @@ RULENAME="date_r15d",EXTRACTION="(%rePartWords( the)? (%reDayNumberTh|%reDayWord
 // EXAMPLE r16b_1: Early 2001
 // EXAMPLE r16c_1: the beginning of November 1999
 // EXAMPLE r16d_1: the middle of September
-RULENAME="date_r16a",EXTRACTION="(%reMonthLong)",NORM_VALUE="UNDEF-year-%normMonth(group(1))"
+RULENAME="date_r16a",EXTRACTION="%reMonthLong",NORM_VALUE="UNDEF-year-%normMonth(group(1))"
 // 2015-03, Jannik: abbreviated month name on its own is quite dangerous
-//RULENAME="date_r16a",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(1))"
-RULENAME="date_r16b",EXTRACTION="%rePartWords([ ]?)%reYear4Digit",NORM_VALUE="group(3)",NORM_MOD="%normPartWords(group(1))"
-RULENAME="date_r16c",EXTRACTION="%rePartWords([ ]?)(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(6)-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))"
-RULENAME="date_r16d",EXTRACTION="%rePartWords([ ]?)(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))"
+//RULENAME="date_r16a",EXTRACTION="%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(1))"
+RULENAME="date_r16b",EXTRACTION="%rePartWords ?%reYear4Digit",NORM_VALUE="group(2)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r16c",EXTRACTION="%rePartWords ?%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r16d",EXTRACTION="%rePartWords ?%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(2))",NORM_MOD="%normPartWords(group(1))"
 
 // date_r17
 // EXAMPLE r17a_1: this year
@@ -303,15 +220,15 @@ RULENAME="date_r16d",EXTRACTION="%rePartWords([ ]?)(%reMonthLong|%reMonthShort)"
 // EXAMPLE r17d_1: this Monday
 // EXAMPLE r17e_1: this summer
 // EXAMPLE r17f_1: this day (using UNDEF-REF normalization)
-RULENAME="date_r17a",EXTRACTION="([Tt]he )?%reThisNextLast %reUnit",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%normUnit(group(3))"
-RULENAME="date_r17b",EXTRACTION="([Tt]he )?%reThisNextLast %reMonthLong",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%LOWERCASE%(group(3))"
-RULENAME="date_r17c",EXTRACTION="([Tt]he )?%reThisNextLast %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%LOWERCASE%(group(3))-%normDay(group(4))"
-RULENAME="date_r17d",EXTRACTION="([Tt]he )?%reThisNextLast %reWeekday",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%LOWERCASE%(group(3))"
-RULENAME="date_r17e",EXTRACTION="([Tt]he )?%reThisNextLast %reSeason",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%normSeason(group(3))"
+RULENAME="date_r17a",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reUnit(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normUnit(group(2))"
+RULENAME="date_r17b",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reMonthLong(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-year-%normMonth(group(2))"
+RULENAME="date_r17c",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reMonthLong %reDayNumber(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-year-%normMonth(group(2))-%normDay(group(3))"
+RULENAME="date_r17d",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reWeekday(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-week-%normDayInWeek(group(2))"
+RULENAME="date_r17e",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reSeason(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-year-%normSeason(group(2))"
 RULENAME="date_r17f",EXTRACTION="[Tt]his day",NORM_VALUE="UNDEF-REF-day-PLUS-0"
-RULENAME="date_r17g",EXTRACTION="([Tt]he )?following %reUnit",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-PLUS-1"
-RULENAME="date_r17h",EXTRACTION="([Tt]he |[Tt]hat |[Tt]his )?same (day|month|year)",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-PLUS-0"
-//RULENAME="date_r17i",EXTRACTION="([Tt]he )?previous %reUnit",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-MINUS-1"
+RULENAME="date_r17g",EXTRACTION="(?:[Tt]he )?following %reUnit",NORM_VALUE="UNDEF-REF-%normUnit(group(1))-PLUS-1"
+RULENAME="date_r17h",EXTRACTION="(?:[Tt]he |[Tt]hat |[Tt]his |)same (day|month|year)",NORM_VALUE="UNDEF-REF-%normUnit(group(1))-PLUS-0"
+//RULENAME="date_r17i",EXTRACTION="(?:[Tt]he )?previous %reUnit",NORM_VALUE="UNDEF-REF-%normUnit(group(1))-MINUS-1"
 
 // date_r18
 // EXAMPLE r18a_1: the beginning of this year
@@ -319,55 +236,59 @@ RULENAME="date_r17h",EXTRACTION="([Tt]he |[Tt]hat |[Tt]his )?same (day|month|yea
 // EXAMPLE r18c_1: the beginning of this November 24
 // EXAMPLE r18d_1: the beginning of this Monday
 // EXAMPLE r18e_1: the beginning of this summer
-RULENAME="date_r18a",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reUnit",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%normUnit(group(5))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r18b",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reMonthLong",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%LOWERCASE%(group(5))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r18c",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%LOWERCASE%(group(5))-%normDay(group(6))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r18d",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reWeekday",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%LOWERCASE%(group(5))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r18e",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reSeason",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%normSeason(group(5))",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r18a",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reUnit",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%normUnit(group(3))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r18b",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reMonthLong",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r18c",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normMonth(group(3))-%normDay(group(4))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r18d",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reWeekday",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-week-%normDayInWeek(group(3))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r18e",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reSeason",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normSeason(group(3))",NORM_MOD="%normPartWords(group(1))"
 
 // date_r19 (ago)
 // EXAMPLE r19a_1: at least several years ago
 // EXAMPLE r19b_1: about twenty years ago
 // EXAMPLE r19c_1: about 20 years ago
 // EXAMPLE r19d_1: a month ago
-RULENAME="date_r19a",EXTRACTION="(%reApproximate )?(several|a couple of|some|a few|many) (%reUnit|minutes|hours)( or so| or more)? (ago|earlier)",NORM_VALUE="PAST_REF"
-RULENAME="date_r19b",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|minutes|hours)( or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(6))-MINUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r19c",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|minutes|hours)( or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(4))-MINUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r19d",EXTRACTION="(%reApproximate )?(an|a) (week-ends?|weekends?|trading days?|days?|months?|weeks?|decades?|century|quarters?|centuries|years?)( or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(4))-MINUS-1",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="date_r19a",EXTRACTION="(?:%reApproximate )?(?:several|a couple of|some|a few|many) %reUnitFine(?: or so| or more|) (?:ago|earlier)",NORM_VALUE="PAST_REF"
+RULENAME="date_r19b",EXTRACTION="(%reApproximate )?%(reNumWord1D|reNumWord2D) %reUnitFine(?: or so| or more|) ago",NORM_VALUE="UNDEF-this-%normUnit(group(4))-MINUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="date_r19c",EXTRACTION="(%reApproximate )?(\d+) %reUnitFine(?: or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(4))-MINUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="date_r19d",EXTRACTION="(%reApproximate )?an? (week-ends?|weekends?|trading days?|days?|months?|weeks?|decades?|century|quarters?|centuries|years?)(?: or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(3))-MINUS-1",NORM_MOD="%normApprox4Dates(group(2))"
 RULENAME="date_r19e",EXTRACTION="coming %reUnit",NORM_VALUE="FUTURE_REF"
 
 // date_r20 (later)
 // EXAMPLE r20a_1: some days later
 // EXAMPLE r20b_1: about twenty days later
 // EXAMPLE r20c_1: about 20 days later
-// EXAMPLE r20d_1: a year later
-RULENAME="date_r20a",EXTRACTION="(%reApproximate )?(several|a couple of|some|a few|many) (%reUnit|minutes|hours) later",NORM_VALUE="FUTURE_REF"
-RULENAME="date_r20b",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|minutes|hours) later",NORM_VALUE="UNDEF-REF-%normUnit(group(6))-PLUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r20c",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|minutes|hours) later",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r20d",EXTRACTION="(%reApproximate )?(an|a) (%reUnit) later",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-1",NORM_MOD="%normApprox4Dates(group(2))"
+// EXAMPLE r20d_1: a week later
+// EXAMPLE r20f_1: on 30 minutes [something happened]
+// EXAMPLE r20g_1: on approximately thirty minutes [something happened]
+RULENAME="date_r20a",EXTRACTION="(?:%reApproximate )?(?:several|a couple of|some|a few|many) %reUnitFine (?:later|into)",NORM_VALUE="FUTURE_REF"
+RULENAME="date_r20b",EXTRACTION="(?:%reApproximate )?%(reNumWord1D|reNumWord2D) %reUnitFine (?:later|into)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-PLUS-%normDurationNumber(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_r20c",EXTRACTION="(?:%reApproximate )?(\d+) %reUnitFine (?:later|into)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-PLUS-group(2)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_r20d",EXTRACTION="(?:%reApproximate )?an? %reUnitFine (?:later|into)",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-PLUS-1",NORM_MOD="%normApprox4Dates(group(1))"
 RULENAME="date_r20e",EXTRACTION="recent %reUnit",NORM_VALUE="PAST_REF"
+RULENAME="date_r20f",EXTRACTION="[Oo]n ((?:%reApproximate )?(\d+) %reUnitFine)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))",OFFSET="group(1)-group(1)"
+RULENAME="date_r20g",EXTRACTION="[Oo]n ((?:%reApproximate )?%(reNumWord1D|reNumWord2D) %reUnitFine)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))",OFFSET="group(1)-group(1)"
 
 // date_r21 (earlier)
 // EXAMPLE r21a_1: twenty days earlier
 // EXAMPLE r21b_1: about 20 days earlier
-// EXAMPLE r21c_1: a year earlier
-RULENAME="date_r21a",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|minutes|hours) earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(6))-MINUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r21b",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|minutes|hours) earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-MINUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r21c",EXTRACTION="(%reApproximate )?(an|a) (%reUnit) earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-MINUS-1",NORM_MOD="%normApprox4Dates(group(2))"
+// EXAMPLE r21c_1: a week earlier
+RULENAME="date_r21a",EXTRACTION="(?:%reApproximate )?%(reNumWord1D|reNumWord2D) %reUnitFine earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-MINUS-%normDurationNumber(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_r21b",EXTRACTION="(?:%reApproximate )?(\d+) %reUnitFine earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-MINUS-group(2)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_r21c",EXTRACTION="(?:%reApproximate )?an? %reUnit earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-MINUS-1",NORM_MOD="%normApprox4Dates(group(1))"
 
 // date_r24 (ago/earlier/later normalizing with REFUNIT)
 // EXAMPLE r22a_1: a year ago
 // EXAMPLE r22b_1: a year later
-RULENAME="date_r22a",EXTRACTION="[Aa] year (ago|earlier)",NORM_VALUE="UNDEF-REFUNIT-year-MINUS-1"
-RULENAME="date_r22b",EXTRACTION="[Aa] year (later)",NORM_VALUE="UNDEF-REFUNIT-year-PLUS-1"
+RULENAME="date_r22a",EXTRACTION="[Aa] year (?:ago|earlier)",NORM_VALUE="UNDEF-REFUNIT-year-MINUS-1"
+RULENAME="date_r22b",EXTRACTION="[Aa] year (?:later)",NORM_VALUE="UNDEF-REFUNIT-year-PLUS-1"
 
 // date_r23
 // EXAMPLE r23a_1: the year-earlier first quarter
 // EXAMPLE r23b_1: the year-earlier quarter
 // EXAMPLE r23c_1: the quarter
-RULENAME="date_r23a",EXTRACTION="([Tt]he )?(year-earlier|year-ago) %rePartOfYear",NORM_VALUE="UNDEF-REF-year-MINUS-1-%normPartOfYear(group(3))"
-RULENAME="date_r23b",EXTRACTION="([Tt]he|[Tt]hat) (year-earlier|year-ago) quarter",NORM_VALUE="UNDEF-REF-quarter-MINUS-4"
-RULENAME="date_r23c",EXTRACTION="([Tt]he|[Tt]hat) quarter",NORM_VALUE="UNDEF-REF-quarter-PLUS-0"
+RULENAME="date_r23a",EXTRACTION="(?:[Tt]he )?(year-earlier|year-ago) %rePartOfYear",NORM_VALUE="UNDEF-REF-year-MINUS-1-%normPartOfYear(group(2))"
+RULENAME="date_r23b",EXTRACTION="(?:[Tt]he|[Tt]hat) (year-earlier|year-ago) quarter",NORM_VALUE="UNDEF-REF-quarter-MINUS-4"
+RULENAME="date_r23c",EXTRACTION="(?:[Tt]he|[Tt]hat) quarter",NORM_VALUE="UNDEF-REF-quarter-PLUS-0"
 
 ///////////////////
@@ -382,8 +303,7 @@ RULENAME="date_r23c",EXTRACTION="([Tt]he|[Tt]hat) quarter",NORM_VALUE="UNDEF-REF
 RULENAME="date_r24a",EXTRACTION="%reHolidayFix",NORM_VALUE="UNDEF-year-%normHolidayFix(group(1))"
 RULENAME="date_r24b",EXTRACTION="%reHolidayFix %reYear4Digit",NORM_VALUE="group(2)-%normHolidayFix(group(1))"
-RULENAME="date_r24c",EXTRACTION="%reHolidayFix %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayFix(group(1))"
-RULENAME="date_r24d",EXTRACTION="%reHolidayFix '%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayFix(group(1))"
+RULENAME="date_r24cd",EXTRACTION="%reHolidayFix '?+%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayFix(group(1))"
 
 //date_r25
 //EXAMPLE r25a_1: Eastersunday
@@ -393,8 +313,95 @@ RULENAME="date_r24d",EXTRACTION="%reHolidayFix '%reYear2Digit",NORM_VALUE="UNDEF
 RULENAME="date_r25a",EXTRACTION="%reHolidayVar",NORM_VALUE="UNDEF-year-%normHolidayVar(group(1))"
 RULENAME="date_r25b",EXTRACTION="%reHolidayVar %reYear4Digit",NORM_VALUE="group(2)-%normHolidayVar(group(1))"
-RULENAME="date_r25c",EXTRACTION="%reHolidayVar %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayVar(group(1))"
-RULENAME="date_r25d",EXTRACTION="%reHolidayVar '%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayVar(group(1))"
+RULENAME="date_r25cd",EXTRACTION="%reHolidayVar '?+%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayVar(group(1))"
+
+
+// Note: rules with "-BCADhint" in the rule name contain explicit BC or AD information.
+// This information is important during the normalization process.
+
+///////////////////
+// History RULES //
+///////////////////
+
+// historic dates; year granularity; with explicit AD / BC hints
+// EXAMPLE historic_1a-BCADhint: 190 BC (1- to 4-digit year)
+// EXAMPLE historic_1b-BCADhint: BC 190 (1- to 4-digit year)
+// EXAMPLE historic_1c-BCADhint: 190 or 180 BC (find "190 BC"; 1- to 4-digit year)
+RULENAME="date_historic_1a-BCADhint",EXTRACTION="(?:%reApproximate )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%normYearBC(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_historic_1b-BCADhint",EXTRACTION="(?:%reApproximate )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(2))%normYearBC(group(3))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="x_date_historic_1c-BCADhint",EXTRACTION="(?:%reApproximate )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(2))",NORM_MOD="%normApprox4Dates(group(1))",OFFSET="group(0)-group(2)"
+
+// historic dates; month granularity
+// EXAMPLE historic_2a-BCADhint: March 190 BC (1- to 4-digit year)
+// EXAMPLE historic_2b: March 190 (3-digit year)
+// EXAMPLE historic_2c: in March 90 (2-digit year)
+// EXAMPLE historic_2d: March of 90 (2-digit year)
+RULENAME="date_historic_2a-BCADhint",EXTRACTION="(?:%reApproximate )?%(reMonthLong|reMonthShort) (?:of )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_historic_2b",EXTRACTION="(?:%reApproximate )?%(reMonthLong|reMonthShort) (?:of )?([\d][\d][\d])",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_historic_2c",EXTRACTION="[Ii]n %(reMonthLong|reMonthShort) %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normMonth(group(1))",OFFSET="group(1)-group(2)"
+RULENAME="date_historic_2d",EXTRACTION="%(reMonthLong|reMonthShort) of %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normMonth(group(1))"
+
+// historic dates; day granularity
+// EXAMPLE historic_3a-BCADhint: March 29, 190 BC (1- to 4-digit year)
+// EXAMPLE historic_3b-BCADhint: 29 March 190 BC (1- to 4-digit year)
+// EXAMPLE historic_3c-BCADhint: 29th of March 190 BC (1- to 4-digit year)
+// EXAMPLE historic_3d: March 29, 190 (3-digit year)
+// EXAMPLE historic_3e: March 29, 90 (2-digit year)
+RULENAME="date_historic_3a-BCADhint",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)\s?,? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(1))-%normDay(group(2))"
+RULENAME="date_historic_3b-BCADhint",EXTRACTION="%reDayNumber %(reMonthLong|reMonthShort)(?:\s?,)? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3c-BCADhint",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh) of %(reMonthLong|reMonthShort) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3d",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)\s?,? (\d\d\d)",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(1))-%normDay(group(2))"
+RULENAME="date_historic_3e",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh),? (\d\d)",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(1))-%normDay(group(2))"
+
+// historic dates; season granularity
+// EXAMPLE historic_4a-BCADhint: summer of 190 BC (1- to 4-digit year)
+RULENAME="date_historic_4a-BCADhint",EXTRACTION="(?:%reApproximate )?(?:the )?%reSeason (?:of )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normSeason(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+
+// historic dates; century granularity
+// EXAMPLE date_historic_5a-BCADhint: the 2nd century BC
+// EXAMPLE date_historic_5b-BCADhint: beginning of the 2nd century BC
+// EXAMPLE date_historic_5c-BCADhint: 2nd or 3rd century BC (find "2nd century BC")
+// EXAMPLE date_historic_5d-BCADhint: beginning of the 2nd or 3rd century BC (find "beginning 2nd century BC")
+RULENAME="date_historic_5a-BCADhint",EXTRACTION="(?:[Tt]he )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(2))%normDay(%SUM%(%normDay(group(1)),-1))"
+RULENAME="date_historic_5b-BCADhint",EXTRACTION="%rePartWords(?: the)? %(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%normDay(%SUM%(%normDay(group(2)),-1))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_historic_5c-BCADhint",EXTRACTION="((?:[Tt]he )?%(reDayNumberTh|reDayWordTh))%reAndOrTo(?:the )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normDay(%SUM%(%normDay(group(2)),-1))",OFFSET="group(1)-group(1)"
+RULENAME="date_historic_5d-BCADhint",EXTRACTION="(%rePartWords(?: the)? %(reDayNumberTh|reDayWordTh))%reAndOrTo(?:the )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(6))%normDay(%SUM%(%normDay(group(3)),-1))",OFFSET="group(1)-group(1)",NORM_MOD="%normPartWords(group(2))"
+
+// historic dates; decade granularity
+// EXAMPLE date_historic_6a-BCADhint: 1990s BC
+// EXAMPLE date_historic_6b-BCADhint: 190s BC
+// EXAMPLE date_historic_6c-BCADhint: 90s BC
+RULENAME="date_historic_6a-BCADhint",EXTRACTION="(?:%rePartWords )?(?:[Tt]he )?([0-9][0-9][0-9]0)'?s %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%SUBSTRING%(group(2),0,3)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_historic_6b-BCADhint",EXTRACTION="(?:%rePartWords )?(?:[Tt]he )?([0-9][0-9]0)'?s %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))0%SUBSTRING%(group(2),0,2)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_historic_6c-BCADhint",EXTRACTION="(?:%rePartWords )?(?:[Tt]he )?([0-9]0)'?s %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))00%SUBSTRING%(group(2),0,1)",NORM_MOD="%normPartWords(group(1))"
+
+// historic dates; year granularity (no AD / BC hints)
+// EXAMPLE date_historic_7a: (in) 190 (3-digit year)
+// EXAMPLE date_historic_7b: (in) 190(,) (3-digit year)
+// EXAMPLE date_historic_7c: (newline)190(newline) (2- to 4-digit year)
+// EXAMPLE date_historic_7d: year of 90 (2-digit year)
+// EXAMPLE date_historic_7e: year of 190 (3-digit year)
+RULENAME="date_historic_7ab",EXTRACTION="[Ii]n ([0-9][0-9][0-9]),?",NORM_VALUE="%normYearBC(group(1))",OFFSET="group(1)-group(1)"
+//RULENAME="date_historic_7b",EXTRACTION="[Ii]n ([0-9][0-9][0-9]),",NORM_VALUE="%normYearBC(group(1))",OFFSET="group(1)-group(1)"
+//RULENAME="date_historic_7c",EXTRACTION="\A([0-9][0-9][0-9]?[0-9]?)\Z",NORM_VALUE="%normYearBC(group(1))"
+RULENAME="date_historic_7d",EXTRACTION="(?:[Tt]he )?year of ([0-9][0-9])",NORM_VALUE="UNDEF-centurygroup(1)"
+RULENAME="date_historic_7e",EXTRACTION="(?:[Tt]he )?year of ([0-9][0-9][0-9])",NORM_VALUE="%normYearBC(group(1))" + +// historic dates; 2-digit year granularity (no AD / BC hints) +// EXAMPLE date_historic_8a: (in) 90(,) (2-digit year) +// EXAMPLE date_historic_8b: (in) 90 (2-digit year) +RULENAME="date_historic_8ab",EXTRACTION="[Ii]n ([0-9][0-9]),?",NORM_VALUE="UNDEF-centurygroup(1)",OFFSET="group(1)-group(1)" +//RULENAME="date_historic_8b",EXTRACTION="[Ii]n ([0-9][0-9])",NORM_VALUE="UNDEF-centurygroup(2)",OFFSET="group(2)-group(2)" + +// historic dates; negative rules +// EXAMPLE date_historic_0a: in 90 cases (2- to 4-digit year) +// EXAMPLE date_historic_0b: in 90 nice cases (2- to 4-digit year) +// EXAMPLE date_historic_0c: in 90 nice law cases (2- to 4-digit year) +// EXAMPLE date_historic_0d: in 90 percent (2- to 4-digit year) +RULENAME="date_historic_0a_negative",EXTRACTION="[Ii]n %reYearBC (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NNS:",OFFSET="group(1)-group(1)" +RULENAME="date_historic_0b_negative",EXTRACTION="[Ii]n %reYearBC (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):JJ:group(3):NNS:",OFFSET="group(1)-group(1)" +RULENAME="date_historic_0c_negative",EXTRACTION="[Ii]n %reYearBC (\S+) (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):JJ:group(3):NN:group(4):NNS:",OFFSET="group(1)-group(1)" +RULENAME="date_historic_0d_negative",EXTRACTION="[Ii]n %reYearBC (?:kilometers?|miles?|foot|feet|dollars?|percents?|millions?|mi|ft|km|%|\$)",NORM_VALUE="REMOVE" //////////////////// @@ -408,46 +415,50 @@ RULENAME="date_r25d",EXTRACTION="%reHolidayVar '%reYear2Digit",NORM_VALUE="UNDEF // EXAMPLE r2b_negative_1: they march the way (if it is a verb) // EXAMPLE r2c_negative_1: may (if it is a verb) // EXAMPLE r2d_negative_1: may (or march, fall -- if it is lower case and without any further temporal stuff around it...) 
-RULENAME="x_date_r2a_negative",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):VBP:" -RULENAME="x_date_r2b_negative",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):VVP:" -RULENAME="x_date_r2c_negative",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):MD:" -RULENAME="x_date_r2d1_negative",EXTRACTION="(may|march|fall)",NORM_VALUE="REMOVE" +RULENAME="x_date_r2a_negative",EXTRACTION="%(reMonthLong|reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):VBP:" +RULENAME="x_date_r2b_negative",EXTRACTION="%(reMonthLong|reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):VVP:" +RULENAME="x_date_r2c_negative",EXTRACTION="%(reMonthLong|reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):MD:" +RULENAME="x_date_r2d1_negative",EXTRACTION="(?:may|march|fall)",NORM_VALUE="REMOVE" RULENAME="x_date_r2d2_negative",EXTRACTION="[Tt]he fall",NORM_VALUE="REMOVE" -RULENAME="x_date_r2e_negative",EXTRACTION="(March) ([Ff]or|[Aa]gainst|[Tt]o) (the )?([A-Z][\S]+)",NORM_VALUE="REMOVE" -RULENAME="x_date_r2f_negative",EXTRACTION="([Tt]he )?(Fall) ([Oo]f) (the )?([A-Z][\S]+)",NORM_VALUE="REMOVE" +RULENAME="x_date_r2e_negative",EXTRACTION="[mM]arch ([Ff]or|[Aa]gainst|[Tt]o) (the )?([A-Z]\S+)",NORM_VALUE="REMOVE" +RULENAME="x_date_r2f_negative",EXTRACTION="([Tt]he )?[Ff]all [Oo]f (the )?([A-Z]\S+)",NORM_VALUE="REMOVE" // EXAMPLE r3a_negative_1: 2000 soldiers (four digit number followed by a plural noun) // EXAMPLE r3b_negative_1: 2000 dead soldiers (four digit number followed by an adjective and a plural noun) // EXAMPLE r3c_negative_1: 2000 kilometer (four digit number followed a non-temporal unit) -RULENAME="x_date_r3a_negative",EXTRACTION="%reYear4Digit ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NNS:" -RULENAME="x_date_r3b_negative",EXTRACTION="%reYear4Digit ([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):JJ:group(3):NNS:" -RULENAME="x_date_r3c_negative",EXTRACTION="%reYear4Digit(-| )(kilometers?|miles?|foot|feet|dollars?|percents?|millions?|mi|ft|km|%|\$)",NORM_VALUE="REMOVE" +RULENAME="x_date_r3a_negative",EXTRACTION="%reYear4Digit (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NNS:" +RULENAME="x_date_r3b_negative",EXTRACTION="%reYear4Digit (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):JJ:group(3):NNS:" +RULENAME="x_date_r3c_negative",EXTRACTION="%reYear4Digit[- ](?:kilometers?|miles?|foot|feet|dollars?|percents?|millions?|mi|ft|km|%|\$)",NORM_VALUE="REMOVE" // EXAMPLE r4a_negative: W2000.1920 -RULENAME="x_date_r4a_negative",EXTRACTION="[\S]+%reYear4Digit\.%reYear4Digit",NORM_VALUE="REMOVE" +RULENAME="x_date_r4a_negative",EXTRACTION="\S+%reYear4Digit\.%reYear4Digit",NORM_VALUE="REMOVE" // Telephone numbers -RULENAME="x_date_r5a_negative",EXTRACTION="(\()?\d\d\d(\))? \d\d\d-\d\d\d\d",NORM_VALUE="REMOVE" +RULENAME="x_date_r5a_negative",EXTRACTION="(?:\()?\d\d\d(\))? 
\d\d\d-\d\d\d\d",NORM_VALUE="REMOVE" // NEW NEGATIVE RULES: 2015-03-18 (jannik) -RULENAME="x_date_r6a_negative",EXTRACTION="([Aa]ssociation|[Dd]ocument|[Nn]umber|[Ss]ection|[Rr]esolution|HR|UNSCR|[Rr]oom|[Cc]all|[Ss]ervice at|[Pp]lan) (%reYear2Digit-)?%reYear4Digit",NORM_VALUE="REMOVE" +RULENAME="x_date_r6a_negative",EXTRACTION="(?:[Aa]ssociation|[Dd]ocument|[Nn]umber|[Ss]ection|[Rr]esolution|HR|UNSCR|[Rr]oom|[Cc]all|[Ss]ervice at|[Pp]lan) (?:%reYear2Digit-)?%reYear4Digit",NORM_VALUE="REMOVE" // address -RULENAME="x_date_r7a_negative",EXTRACTION="%reYear4Digit [A-Z]([\S]+) (Avenue|Street)",NORM_VALUE="REMOVE" +RULENAME="x_date_r7a_negative",EXTRACTION="%reYear4Digit [A-Z]\S+ (?:Avenue|Street)",NORM_VALUE="REMOVE" // abbreviations // NOT ONLY an "A" because this is likely to be a determiner -RULENAME="x_date_r8a_negative",EXTRACTION="(\b[B-Z]|\b[A-Z][A-Z][A-Z])(-| )%reYear4Digit",NORM_VALUE="REMOVE" -RULENAME="x_date_r8b_negative",EXTRACTION="(\bA)(-)%reYear4Digit",NORM_VALUE="REMOVE" +RULENAME="x_date_r8a_negative",EXTRACTION="(?:\b[B-Z]|\b[A-Z][A-Z][A-Z])[- ]%reYear4Digit",NORM_VALUE="REMOVE" +RULENAME="x_date_r8b_negative",EXTRACTION="(?:\bA)-%reYear4Digit",NORM_VALUE="REMOVE" // Money -RULENAME="x_date_r9a_negative",EXTRACTION="([Ee]uro|EUR|Dollar|\$) [\d]+(-[\d]+)?",NORM_VALUE="REMOVE" +RULENAME="x_date_r9a_negative",EXTRACTION="(?:[Ee]uro|EUR|Dollar|USD|[$€£¥¤]|GPB) \d+(?:-\d+)?",NORM_VALUE="REMOVE" // Unlikely (PAST|PRESENT|FUTURE)_REF expressions -//RULENAME="x_date_r10a_negative",EXTRACTION="([Ss]oon after)",NORM_VALUE="REMOVE" +//RULENAME="x_date_r10a_negative",EXTRACTION="[Ss]oon after",NORM_VALUE="REMOVE" // Issue # 29 - addressed Sept 16, 2015 (heideltime 2.0) // EXAMPLE"in his 20s" -RULENAME="x_date_r11a_negative",EXTRACTION="\b[Ii]n (his|her|their) \d\ds",NORM_VALUE="REMOVE" +RULENAME="x_date_r11a_negative",EXTRACTION="[Ii]n (?:his|her|their) \d\ds",NORM_VALUE="REMOVE" + +// 2017-01-18 false positives: third half-century, sixth half-hour episode +RULENAME="date_r10a_negative",EXTRACTION="\shalf-%reUnit",NORM_VALUE="REMOVE" +RULENAME="date_r10b_negative",EXTRACTION="(?:[Tt]hird|[Ff](?:ourth|ifth)|[Ss](?:ixth|eventh)) [Hh]alf",NORM_VALUE="REMOVE" diff --git a/resources/english/rules/resources_rules_durationrules.txt b/resources/english/rules/resources_rules_durationrules.txt index 3ab12cda..0e9e71d7 100644 --- a/resources/english/rules/resources_rules_durationrules.txt +++ b/resources/english/rules/resources_rules_durationrules.txt @@ -14,20 +14,21 @@ // EXAMPLE r1d-1: less than sixty minutes // EXAMPLE r1e-1: less than 60 minutes // EXAMPLE r1f-1: several minutes -RULENAME="duration_r1a",EXTRACTION="(%reApproximate |[Tt]he )?(%reNumWord2D|%reNumWord1D)( more |-| )%reUnit",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(7))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1b1",EXTRACTION="(%reApproximate )?([\d]+)( more | |-)%reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1b2",EXTRACTION="(%reApproximate |[Tt]he )?([\d]+)( more | )%reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1c",EXTRACTION="(%reApproximate )?([Ss]everal|[Aa] couple of|[Ss]ome|[Mm]any|[Aa] few|[Rr]ecent|[Cc]oming) %reUnit",NORM_VALUE="PX%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1d",EXTRACTION="(%reApproximate |[Tt]he )?(%reNumWord2D|%reNumWord1D)( more | 
|-)(seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(7))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1e1",EXTRACTION="(%reApproximate |[Tt]he )?([\d]+)( more | )(seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1e2",EXTRACTION="(%reApproximate )?([\d]+)( more | |-)(seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1f",EXTRACTION="(%reApproximate )?([Ss]everal|[Aa] couple of|[Ss]ome|[Mm]any|[Aa] few|[Rr]ecent|[Cc]oming) (seconds?|minutes?|hours?)",NORM_VALUE="PTX%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1g1",EXTRACTION="(%reApproximate )(an?)( )%reUnit",NORM_VALUE="P1%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1g2",EXTRACTION="([Aa]n?)( )%reUnit",NORM_VALUE="P1%normUnit4Duration(group(3))" -RULENAME="duration_r1h1",EXTRACTION="(%reApproximate )(an?)( )(second|minute|hour)",NORM_VALUE="PT1%normUnit4Duration(group(5))" -RULENAME="duration_r1h2",EXTRACTION="([Aa]n?)( )(second|minute|hour)",NORM_VALUE="PT1%normUnit4Duration(group(3))",POS_CONSTRAINT="group(3):NN:" -RULENAME="duration_r1i1",EXTRACTION="(%reApproximate )?a (hundred) %reUnit",NORM_VALUE="P100%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r1i2",EXTRACTION="(%reApproximate )?%reNumWord1D (hundred) %reUnit",NORM_VALUE="P%normDurationNumber(group(3))00%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" +RULENAME="duration_r1a",EXTRACTION="(?:%reApproximate |[Tt]he )?%(reNumWord1D|reNumWord2D)(?: more | |-)%reUnit",NORM_VALUE="P%normDurationNumber(group(2))%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1d",EXTRACTION="(?:%reApproximate |[Tt]he )?%(reNumWord1D|reNumWord2D)(?: more | |-)(seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(2))%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1b1",EXTRACTION="(?:%reApproximate )?(\d+)(?: more | |-)%reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1b2",EXTRACTION="(?:%reApproximate )?(\d+)(?: more | |-)(seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(2)%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1e1",EXTRACTION="(?:%reApproximate |[Tt]he )?(\d+) (?:more )?%reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1e2",EXTRACTION="(?:%reApproximate |[Tt]he )?(\d+) (?:more )?(seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(2)%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1c",EXTRACTION="(?:%reApproximate )?(?:[Ss](?:everal|ome)|[Aa] (?:couple of|few)|[Mm]any|[Rr]ecent|[Cc]oming) %reUnit",NORM_VALUE="PX%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1f",EXTRACTION="(?:%reApproximate )?(?:[Ss](?:everal|ome)|[Aa] (?:couple of|few)|[Mm]any|[Rr]ecent|[Cc]oming) (seconds?|minutes?|hours?)",NORM_VALUE="PTX%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))" + +RULENAME="duration_r1g1",EXTRACTION="%reApproximate an? 
%reUnit",NORM_VALUE="P1%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1g2",EXTRACTION="%reApproximate an? (second|minute|hour)",NORM_VALUE="PT1%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1h1",EXTRACTION="(?:[Aa]n?) %reUnit",NORM_VALUE="P1%normUnit4Duration(group(1))" +RULENAME="duration_r1h2",EXTRACTION="(?:[Aa]n?) (second|minute|hour)",NORM_VALUE="PT1%normUnit4Duration(group(1))",POS_CONSTRAINT="group(1):NN:" +RULENAME="duration_r1i1",EXTRACTION="(?:%reApproximate )?a hundred %reUnit",NORM_VALUE="P100%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r1i2",EXTRACTION="(?:%reApproximate )?%reNumWord1D hundred %reUnit",NORM_VALUE="P%normDurationNumber(group(2))00%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))" // duration_r2 // EXAMPLE r2a-1: at least the last twenty years @@ -36,37 +37,37 @@ RULENAME="duration_r1i2",EXTRACTION="(%reApproximate )?%reNumWord1D (hundred) %r // EXAMPLE r2d-1: at least the last twenty minutes // EXAMPLE r2e-1: at least the last 20 minutes // EXAMPLE r2f-1: at least the last several minutes -RULENAME="duration_r2a",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast (%reNumWord2D|%reNumWord1D) %reUnit( or so)?",NORM_VALUE="P%normDurationNumber(group(4))%normUnit4Duration(group(7))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r2b",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast ([\d]+) %reUnit( or so)?",NORM_VALUE="Pgroup(4)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r2c",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast (several|couple of|few) %reUnit( or so)?",NORM_VALUE="PX%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r2d",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast (%reNumWord2D|%reNumWord1D) (seconds?|minutes?|hours?)( or so)?",NORM_VALUE="PT%normDurationNumber(group(4))%normUnit4Duration(group(7))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r2e",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast ([\d]+) (seconds?|minutes?|hours?)( or so)?",NORM_VALUE="PTgroup(4)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" -RULENAME="duration_r2f",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast (several|couple of|few) (seconds?|minutes?|hours?)( or so)?",NORM_VALUE="PTX%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))" +RULENAME="duration_r2a",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast %(reNumWord1D|reNumWord2D) %reUnit(?: or so)?",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r2b",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast (\d+) %reUnit(?: or so)?",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r2c",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast (?:several|couple of|few) %reUnit(?: or so)?",NORM_VALUE="PX%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r2d",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast %(reNumWord1D|reNumWord2D) (seconds?|minutes?|hours?)(?: or so)?",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r2e",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast (\d+) 
(seconds?|minutes?|hours?)(?: or so)?",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))" +RULENAME="duration_r2f",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast (?:several|couple of|few) (seconds?|minutes?|hours?)(?: or so)?",NORM_VALUE="PTX%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))" // duration_r3 // EXAMPLE r3a-1: a three-year period // EXAMPLE r3b-1: a 300 year period // EXAMPLE r3c-1: a three-hour period // EXAMPLE r3d-1: a 300 hour period -RULENAME="duration_r3a",EXTRACTION="(([Aa]n?|[Tt]he) )?(%reNumWord2D|%reNumWord1D)( |-)%reUnit (period|term)",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(7))" -RULENAME="duration_r3b",EXTRACTION="(([Aa]n?|[Tt]he) )?([\d]+)( |-)%reUnit (period|term)",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(5))" -RULENAME="duration_r3c",EXTRACTION="(([Aa]n?|[Tt]he) )?(%reNumWord2D|%reNumWord1D)( |-)(seconds?|minutes?|hours?) (period|term)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(7))" -RULENAME="duration_r3d",EXTRACTION="(([Aa]n?|[Tt]he) )?([\d]+)( |-)(seconds?|minutes?|hours?) (period|term)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(5))" +RULENAME="duration_r3a",EXTRACTION="(?:[Aa]n? |[Tt]he |)%(reNumWord1D|reNumWord2D)[ -]%reUnit (?:period|term)",NORM_VALUE="P%normDurationNumber(group(1))%normUnit4Duration(group(2))" +RULENAME="duration_r3b",EXTRACTION="(?:[Aa]n? |[Tt]he |)(\d+)[ -]%reUnit (?:period|term)",NORM_VALUE="Pgroup(1)%normUnit4Duration(group(2))" +RULENAME="duration_r3c",EXTRACTION="(?:[Aa]n? |[Tt]he |)%(reNumWord1D|reNumWord2D)[ -](seconds?|minutes?|hours?) (?:period|term)",NORM_VALUE="PT%normDurationNumber(group(1))%normUnit4Duration(group(2))" +RULENAME="duration_r3d",EXTRACTION="(?:[Aa]n? |[Tt]he |)(\d+)[ -](seconds?|minutes?|hours?) 
(?:period|term)",NORM_VALUE="PTgroup(1)%normUnit4Duration(group(2))" // duration_r4 -RULENAME="duration_r4a",EXTRACTION="(([Aa]n?)( |-)%reUnit) after",NORM_VALUE="P1%normUnit4Duration(group(4))",OFFSET="group(1)-group(1)" -RULENAME="duration_r4b",EXTRACTION="(([Aa]n?)( |-)(seconds?|minutes?|hours?)) after",NORM_VALUE="PT1%normUnit4Duration(group(4))",OFFSET="group(1)-group(1)" +RULENAME="duration_r4a",EXTRACTION="([Aa]n?[ -]%reUnit) after",NORM_VALUE="P1%normUnit4Duration(group(2))",OFFSET="group(1)-group(1)" +RULENAME="duration_r4b",EXTRACTION="([Aa]n?[ -](seconds?|minutes?|hours?)) after",NORM_VALUE="PT1%normUnit4Duration(group(2))",OFFSET="group(1)-group(1)" // duration_r5 // EXAMPLE: r5_a: two and six days (find "two") -RULENAME="duration_r5a1",EXTRACTION="(%reApproximate )(%reNumWord2D|%reNumWord1D)( to | or | and |-)(%reNumWord2D|%reNumWord1D) %reUnit",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(10))",NORM_MOD="%normApprox4Durations(group(2))",OFFSET="group(1)-group(3)" -RULENAME="duration_r5a2",EXTRACTION="(%reNumWord2D|%reNumWord1D)( to | or | and |-)(%reNumWord2D|%reNumWord1D) %reUnit",NORM_VALUE="P%normDurationNumber(group(1))%normUnit4Duration(group(8))",OFFSET="group(1)-group(1)" -RULENAME="duration_r5b1",EXTRACTION="(%reApproximate )([\d]+)( to | or | and |-)([\d]+) %reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(6))",NORM_MOD="%normApprox4Durations(group(2))",OFFSET="group(1)-group(3)" -RULENAME="duration_r5b2",EXTRACTION="([\d]+)( to | or | and |-)([\d]+) %reUnit",NORM_VALUE="Pgroup(1)%normUnit4Duration(group(4))",OFFSET="group(1)-group(1)" -RULENAME="duration_r5c1",EXTRACTION="(%reApproximate )(%reNumWord2D|%reNumWord1D)( to | or | and |-)(%reNumWord2D|%reNumWord1D) (seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(10))",NORM_MOD="%normApprox4Durations(group(2))",OFFSET="group(1)-group(3)" -RULENAME="duration_r5c2",EXTRACTION="(%reNumWord2D|%reNumWord1D)( to | or | and |-)(%reNumWord2D|%reNumWord1D) (seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(1))%normUnit4Duration(group(8))",OFFSET="group(1)-group(1)" -RULENAME="duration_r5d1",EXTRACTION="(%reApproximate )([\d]+)( to | or | and |-)([\d]+) (seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(6))",NORM_MOD="%normApprox4Durations(group(2))",OFFSET="group(1)-group(3)" -RULENAME="duration_r5d2",EXTRACTION="([\d]+)( to | or | and |-)([\d]+) (seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(1)%normUnit4Duration(group(4))",OFFSET="group(1)-group(1)" +RULENAME="duration_r5a1",EXTRACTION="%reApproximate %(reNumWord1D|reNumWord2D)(?: to | or | and |-)%(reNumWord1D|reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(2))%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))",OFFSET="group(1)-group(2)" +RULENAME="duration_r5b1",EXTRACTION="%(reNumWord1D|reNumWord2D)(?: to | or | and |-)%(reNumWord1D|reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(1))%normUnit4Duration(group(3))",OFFSET="group(1)-group(1)" +RULENAME="duration_r5c1",EXTRACTION="%reApproximate (\d+)(?: to | or | and |-)(\d+) %reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))",OFFSET="group(1)-group(2)" +RULENAME="duration_r5d1",EXTRACTION="(\d+)(?: to | or | and |-)(\d+) %reUnit",NORM_VALUE="Pgroup(1)%normUnit4Duration(group(3))",OFFSET="group(1)-group(1)" +RULENAME="duration_r5a2",EXTRACTION="%reApproximate %(reNumWord1D|reNumWord2D)(?: to | or | and 
|-)%(reNumWord1D|reNumWord2D) (seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(2))%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))",OFFSET="group(1)-group(2)" +RULENAME="duration_r5b2",EXTRACTION="%(reNumWord1D|reNumWord2D)(?: to | or | and |-)%(reNumWord1D|reNumWord2D) (seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(1))%normUnit4Duration(group(3))",OFFSET="group(1)-group(1)" +RULENAME="duration_r5c2",EXTRACTION="%reApproximate (\d+)(?: to | or | and |-)(\d+) (seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(2)%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))",OFFSET="group(1)-group(2)" +RULENAME="duration_r5d2",EXTRACTION="(\d+)(?: to | or | and |-)(\d+) (seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(1)%normUnit4Duration(group(3))",OFFSET="group(1)-group(1)" //////////////////// // NEGATIVE RULES // @@ -75,12 +76,11 @@ RULENAME="duration_r5d2",EXTRACTION="([\d]+)( to | or | and |-)([\d]+) (seconds? // EXAMPLE r1a_negative-1: about 200 years older // EXAMPLE r1b_negative-1: several days old // EXAMPLE r1c_negative-1: 59-year-old -RULENAME="duration_r1a_negation",EXTRACTION="(%reApproximate |[Tt]he )?(%reNumWord2D|%reNumWord1D|[\d]+) (%reUnit|minutes?|hours?) (older|younger|old|young)",NORM_VALUE="REMOVE" -RULENAME="duration_r1b_negation",EXTRACTION="(%reApproximate |[Tt]he )?([Ss]everal|[Aa] couple of|[Ss]ome|[Mm]any|[Aa] few|[Rr]ecent|[Cc]oming) (%reUnit|minutes?|hours?) (older|younger|old|young)",NORM_VALUE="REMOVE" -RULENAME="duration_r1c_negation",EXTRACTION="([Tt]he )?(%reNumWord2D|%reNumWord1D|[\d]+)-(%reUnit|minutes?|hours?)-(older|younger|old|young)",NORM_VALUE="REMOVE" -RULENAME="duration_r1d_negation",EXTRACTION="(%reApproximate )?(an|a)( )%reUnit-([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(5):NN:" +RULENAME="duration_r1a_negation",EXTRACTION="(?:%reApproximate |[Tt]he |)(?:%(reNumWord1D|reNumWord2D)|[\d]+) %reUnitFine (?:older|younger|old|young)",NORM_VALUE="REMOVE" +RULENAME="duration_r1b_negation",EXTRACTION="(?:%reApproximate |[Tt]he |)(?:[Ss]everal|[Aa] couple of|[Ss]ome|[Mm]any|[Aa] few|[Rr]ecent|[Cc]oming) %reUnitFine (?:older|younger|old|young)",NORM_VALUE="REMOVE" +RULENAME="duration_r1c_negation",EXTRACTION="(?:[Tt]he |)(?:%(reNumWord1D|reNumWord2D)|\d+)-%reUnitFine-(?:older|younger|old|young)",NORM_VALUE="REMOVE" +RULENAME="duration_r1d_negation",EXTRACTION="(?:%reApproximate |)an? 
%reUnit-(?:\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(4):NN:" // NEW NEGATIVE RULES: 2015-03-18 (jannik) -RULENAME="duration_r2a_negation",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D)(-| )quarters?",NORM_VALUE="REMOVE" -RULENAME="duration_r2b_negation",EXTRACTION="(%reApproximate )?(a) quarter",NORM_VALUE="REMOVE" \ No newline at end of file +RULENAME="duration_r2a_negation",EXTRACTION="(?:%reApproximate )?(?:%(reNumWord1D|reNumWord2D)|a)[- ]quarters?",NORM_VALUE="REMOVE" diff --git a/resources/english/rules/resources_rules_setrules.txt b/resources/english/rules/resources_rules_setrules.txt index f4e4d38b..51c8eba3 100644 --- a/resources/english/rules/resources_rules_setrules.txt +++ b/resources/english/rules/resources_rules_setrules.txt @@ -14,7 +14,7 @@ // EXAMPLE r1d-1: every summer RULENAME="set_r1a",EXTRACTION="([Ee]very|[Ee]ach) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))" RULENAME="set_r1b",EXTRACTION="([Ee]very|[Ee]ach) %reWeekday",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(2))",NORM_QUANT="%UPPERCASE%(group(1))",NORM_FREQ="1W" -RULENAME="set_r1c",EXTRACTION="([Ee]very|[Ee]ach) (%reMonthLong|%reMonthShort)",NORM_VALUE="XXXX-%normMonth(group(2))",NORM_QUANT="%UPPERCASE%(group(1))",NORM_FREQ="1M" +RULENAME="set_r1c",EXTRACTION="([Ee]very|[Ee]ach) %(reMonthLong|reMonthShort)",NORM_VALUE="XXXX-%normMonth(group(2))",NORM_QUANT="%UPPERCASE%(group(1))",NORM_FREQ="1M" RULENAME="set_r1d",EXTRACTION="([Ee]very|[Ee]ach) %reSeason",NORM_VALUE="XXXX-%normSeason(group(2))",NORM_QUANT="%UPPERCASE%(group(1))",NORM_FREQ="1S" // set_r2 @@ -24,22 +24,22 @@ RULENAME="set_r1d",EXTRACTION="([Ee]very|[Ee]ach) %reSeason",NORM_VALUE="XXXX-%n // EXAMPLE r2d-1: 40 times per month // EXAMPLE r2e-1: a month // EXAMPLE r2f-1: a minute -RULENAME="set_r2a",EXTRACTION="[Oo]nce (a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_FREQ="1" -RULENAME="set_r2b",EXTRACTION="[Tt]wice (a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_FREQ="2" -RULENAME="set_r2c",EXTRACTION="(%reNumWord1D|%reNumWord2D) times? (a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(5)),0,1))",NORM_FREQ="%normDurationNumber(group(1))" -RULENAME="set_r2d",EXTRACTION="([\d]+) times? (a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_FREQ="group(1)" -RULENAME="set_r2e",EXTRACTION="(a|an)( |-)%reUnit",NORM_VALUE="P1%normUnit4Duration(group(3))",NORM_FREQ="1" -RULENAME="set_r2f",EXTRACTION="(a|an)( |-)(minutes?|hours?)",NORM_VALUE="PT1%normUnit4Duration(group(3))",NORM_FREQ="1" +RULENAME="set_r2a",EXTRACTION="[Oo]nce (?:a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(1)),0,1))",NORM_FREQ="1" +RULENAME="set_r2b",EXTRACTION="[Tt]wice (?:a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(1)),0,1))",NORM_FREQ="2" +RULENAME="set_r2c",EXTRACTION="%(reNumWord1D|reNumWord2D) times? (?:a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_FREQ="%normDurationNumber(group(1))" +RULENAME="set_r2d",EXTRACTION="([\d]+) times? 
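A note on the rewrite pattern running through the duration rules above: the patch converts optional wrappers and alternations from capturing `( ... )` to non-capturing `(?: ... )`, and the `%(a|b)` shorthand appears to expand to a single capturing group, so every `group(n)` index in NORM_VALUE, NORM_MOD, OFFSET, and POS_CONSTRAINT shifts accordingly. A minimal Java sketch of the renumbering effect, using simplified stand-in patterns rather than the real expanded rule regexes:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Why the rewritten rules renumber group(n): making wrappers non-capturing
// shifts the indices of the remaining capturing groups.
public class GroupRenumbering {
    public static void main(String[] args) {
        String text = "about five days";
        // Old style: the optional wrapper and each alternative capture, so the
        // number word only surfaces at group 3 (simplified to one nesting level).
        Matcher old = Pattern.compile("((about) )?((five)|(5)) days").matcher(text);
        if (old.find()) {
            System.out.println("old: number word = group(3) = " + old.group(3));
        }
        // New style: the wrapper is non-capturing, so the same content moves
        // up to group(2), matching the compact indices in the new NORM_VALUEs.
        Matcher neu = Pattern.compile("(?:(about) )?(five|5) days").matcher(text);
        if (neu.find()) {
            System.out.println("new: number word = group(2) = " + neu.group(2));
        }
    }
}
```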
(?:a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_FREQ="group(1)" +RULENAME="set_r2e",EXTRACTION="(?:a|an)[\s-]%reUnit",NORM_VALUE="P1%normUnit4Duration(group(1))",NORM_FREQ="1" +RULENAME="set_r2f",EXTRACTION="(?:a|an)[\s-](minutes?|hours?)",NORM_VALUE="PT1%normUnit4Duration(group(1))",NORM_FREQ="1" // set_r3 // EXAMPLE r3a-1: every 5 years // EXAMPLE r3b-1: every two days -RULENAME="set_r3a",EXTRACTION="([Ee]very) ([\d]+) %reUnit",NORM_VALUE="Pgroup(2)%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))" -RULENAME="set_r3b",EXTRACTION="([Ee]very) (%reNumWord1D|%reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(2))%UPPERCASE%(%SUBSTRING%(%normUnit(group(5)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))" +RULENAME="set_r3a",EXTRACTION="([Ee]very) (\d+) %reUnit",NORM_VALUE="Pgroup(2)%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))" +RULENAME="set_r3b",EXTRACTION="([Ee]very) %(reNumWord1D|reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(2))%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))" // set_r4 // EXAMPLE r4a-1: 2 days each week -RULENAME="set_r4a",EXTRACTION="([\d]+) %reUnit (each|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(4)),0,1))",NORM_QUANT="EACH",NORM_FREQ="group(1)%normUnit(group(2))" +RULENAME="set_r4a",EXTRACTION="([\d]+) %reUnit (?:each|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_QUANT="EACH",NORM_FREQ="group(1)%normUnit(group(2))" // set_r5 // EXAMPLE r5a-1: annually @@ -52,5 +52,5 @@ RULENAME="set_r5a",EXTRACTION="%reSetWords",NORM_VALUE="%normSetWords(group(1))" // EXAMPLE r6a-1: Monday afternoons // EXAMPLE r6b-1: Monday and Tuesday nights (find: Monday nights) RULENAME="set_r6a",EXTRACTION="%reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(2))",NORM_FREQ="1W" -RULENAME="set_r6b",EXTRACTION="%reWeekday (and|or) %reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(4))",NORM_FREQ="1W",OFFSET="group(1)-group(1)" +RULENAME="set_r6b",EXTRACTION="%reWeekday (?:and|or) %reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(3))",NORM_FREQ="1W",OFFSET="group(1)-group(1)" diff --git a/resources/english/rules/resources_rules_timerules.txt b/resources/english/rules/resources_rules_timerules.txt index 6d77952c..84a1dc63 100644 --- a/resources/english/rules/resources_rules_timerules.txt +++ b/resources/english/rules/resources_rules_timerules.txt @@ -14,12 +14,12 @@ // EXAMPLE r1c-1: 12/29/2000 20:29 // EXAMPLE r1d-1: 12/29/2000 20:29:29 // EXAMPLE r1e-1: 12/29/2000 20:29:29.79 -RULENAME="time_r1a",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7):group(8)" -RULENAME="time_r1b",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7)" -RULENAME="time_r1c",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6)" -RULENAME="time_r1d",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6):group(7)" -RULENAME="time_r1e",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) 
diff --git a/resources/english/rules/resources_rules_timerules.txt b/resources/english/rules/resources_rules_timerules.txt
index 6d77952c..84a1dc63 100644
--- a/resources/english/rules/resources_rules_timerules.txt
+++ b/resources/english/rules/resources_rules_timerules.txt
@@ -14,12 +14,12 @@
 // EXAMPLE r1c-1: 12/29/2000 20:29
 // EXAMPLE r1d-1: 12/29/2000 20:29:29
 // EXAMPLE r1e-1: 12/29/2000 20:29:29.79
-RULENAME="time_r1a",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7):group(8)"
-RULENAME="time_r1b",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7)"
-RULENAME="time_r1c",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6)"
-RULENAME="time_r1d",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6):group(7)"
-RULENAME="time_r1e",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute:%reTimeMinute\.%reYear2Digit",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6):group(7).group(8)"
-RULENAME="time_r1f",EXTRACTION="%reYear4Digit%reMonthNumber%reDayNumber-%reTimeHour(?:-|:)%reTimeMinute(?:-|:)%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5):group(6)"
+RULENAME="time_r1a",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber[T ]%reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5):group(6)"
+RULENAME="time_r1b",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber[T ]%reTimeHour:%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5)"
+RULENAME="time_r1c",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5)"
+RULENAME="time_r1d",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5):group(6)"
+RULENAME="time_r1e",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute\.%reYear2Digit",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5):group(6).group(7)"
+RULENAME="time_r1f",EXTRACTION="%reYear4Digit%reMonthNumber%reDayNumber-%reTimeHour[-:]%reTimeMinute[-:]%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5):group(6)"
 
 // time_r2
 // EXAMPLE r2a-1: 09-24-99 1145EST (TimeStamp style with timezone information)
@@ -27,10 +27,10 @@ RULENAME="time_r1f",EXTRACTION="%reYear4Digit%reMonthNumber%reDayNumber-%reTimeH
 // EXAMPLE r2c-1: Wed, 29 Dec 2004 00:28:16 +0000
 // EXAMPLE r2d-1: Sat, 29 Jan 2005 17:21:13 -0600
 // EXAMPLE r2d-2: 1 Feb 2005 16:13:33 +1300
-RULENAME="time_r2a",EXTRACTION="(%reMonthNumber-%reDayNumber-%reYear2Digit)( %reTimeHour(:)?%reTimeMinute)%reTimezone",NORM_VALUE="UNDEF-centurygroup(4)-group(2)-group(3)T%normMinute(group(6)):%normMinute(group(8))%normTimezone(group(9))"
-RULENAME="time_r2b",EXTRACTION="%reMonthLong %reDayNumber, %reYear4Digit %reTimeHour(:)?%reTimeMinute %reTimezone",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))T%normMinute(group(4)):%normMinute(group(6))"
-RULENAME="time_r2c",EXTRACTION="((Mon|Tue|Wed|Thu|Fri|Sat|Sun), )?%reDayNumber (%reMonthLong|%reMonthShort) %reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute (\+|-)0000",NORM_VALUE="group(7)-%normMonth(group(4))-%normDay(group(3))Tgroup(8):group(9):group(10)"
-RULENAME="time_r2d",EXTRACTION="((Mon|Tue|Wed|Thu|Fri|Sat|Sun), )?%reDayNumber (%reMonthLong|%reMonthShort) %reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute (\+|-)(\d\d)00",NORM_VALUE="group(7)-%normMonth(group(4))-%normDay(group(3))Tgroup(8):group(9):group(10)group(11)group(12)"
+RULENAME="time_r2a",EXTRACTION="%reMonthNumber-%reDayNumber-%reYear2Digit %reTimeHour:?%reTimeMinute%reTimezone",NORM_VALUE="UNDEF-centurygroup(3)-group(1)-group(2)T%normMinute(group(4)):%normMinute(group(5))%normTimezone(group(6))"
+RULENAME="time_r2b",EXTRACTION="%reMonthLong %reDayNumber, %reYear4Digit %reTimeHour:?%reTimeMinute %reTimezone",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))T%normMinute(group(4)):%normMinute(group(5))"
+// matched by r2d RULENAME="time_r2c",EXTRACTION="(?:Mon, |Tue, |Wed, |Thu, |Fri, |Sat, |Sun, )?%reDayNumber %(reMonthLong|reMonthShort) %reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute (\+|-)0000",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))Tgroup(4):group(5):group(6)"
+RULENAME="time_r2d",EXTRACTION="(?:Mon, |Tue, |Wed, |Thu, |Fri, |Sat, |Sun, )?%reDayNumber %(reMonthLong|reMonthShort) %reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute (\+|-)(\d\d)00",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))Tgroup(4):group(5):group(6)group(7)group(8)"
 RULENAME="time_r2e",EXTRACTION="(%reMonthLong %reDayNumber, %reTimeHour %reTimezone) in %reYear4Digit",NORM_VALUE="group(6)-%normMonth(group(2))-%normDay(group(3))T%normMinute(group(4)):00",OFFSET="group(1)-group(1)"
 RULENAME="time_r2f",EXTRACTION="(%reMonthLong %reDayNumber, %reTimeHour:%reTimeMinute%reTimezone) in %reYear4Digit",NORM_VALUE="group(7)-%normMonth(group(2))-%normDay(group(3))T%normMinute(group(4)):%normMinute(group(5))",OFFSET="group(1)-group(1)"
@@ -59,7 +59,7 @@ RULENAME="time_r3e",EXTRACTION="%reThisNextLast %reWeekday %rePartOfDay",NORM_VA
 // EXAMPLE r4a-1: earlier this afternoon
 // EXAMPLE r4a-2: later last night
 // EXAMPLE r4b-1: tonight
-RULENAME="time_r4a",EXTRACTION="(([Ee]arlier|[Ll]ater|[Ee]arly) )?%reThisNextLast %rePartOfDay",NORM_VALUE="UNDEF-%normThisNextLast(group(3))-dayT%normPartOfDay(group(4))"
+RULENAME="time_r4a",EXTRACTION="(?:[Ee]arlier |[Ll]ater |[Ee]arly )?%reThisNextLast %rePartOfDay",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-dayT%normPartOfDay(group(2))"
 RULENAME="time_r4b",EXTRACTION="([Tt]onight)",NORM_VALUE="UNDEF-this-dayT%normPartOfDay(group(1))"
 
 ///////////////////////////
@@ -72,15 +72,15 @@ RULENAME="time_r4b",EXTRACTION="([Tt]onight)",NORM_VALUE="UNDEF-this-dayT%normPa
 // EXAMPLE r5c-1: 11:30 a.m.
 // EXAMPLE r5d-1: 9:30 p.m.
 // EXAMPLE r5e-1: 10:30:34 a.m.
-// EXAMPLE r5e-1: 10:30:34 p.m.
-RULENAME="time_r5a",EXTRACTION="(%reApproximate )?%reTimeHour[\s]*[Aa][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):00",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5b",EXTRACTION="(%reApproximate )?%reTimeHour[\s]*[Pp][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(3)),12):00",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5c",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute[\s]*[Aa][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5d",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute[\s]*[Pp][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(3)),12):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5e",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute:%reTimeMinute[\s]*[Aa][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5f",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute:%reTimeMinute[\s]*[Pp][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(3)),12):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5g",EXTRACTION="(%reApproximate )?%reTimeHour%reTimeMinute[\s]*[Aa][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5h",EXTRACTION="(%reApproximate )?%reTimeHour%reTimeMinute[\s]*[Pp][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(3)),12):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
+// EXAMPLE r5f-1: 10:30:34 p.m.
+RULENAME="time_r5a",EXTRACTION="(?:%reApproximate )?%reTimeHour\s*[Aa]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):00",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5b",EXTRACTION="(?:%reApproximate )?%reTimeHour\s*[Pp]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):00",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5c",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute\s*[Aa]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5d",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute\s*[Pp]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5e",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute:%reTimeMinute\s*[Aa]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3):group(4)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5f",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute:%reTimeMinute\s*[Pp]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):group(3):group(4)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5g",EXTRACTION="(?:%reApproximate )?%reTimeHour%reTimeMinute\s*[Aa]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5h",EXTRACTION="(?:%reApproximate )?%reTimeHour%reTimeMinute\s*[Pp]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
 
 // time_r6
@@ -89,24 +89,24 @@ RULENAME="time_r5h",EXTRACTION="(%reApproximate )?%reTimeHour%reTimeMinute[\s]*[
 // EXAMPLE r6b-1: 9 pm Wednesday
 // EXAMPLE r6c-1: 9:30 a.m. Wednesday
 // EXAMPLE r6d-1: 9:30 p.m. Wednesday
-RULENAME="time_r6a",EXTRACTION="(%reApproximate )?%reTimeHour[\s]*[Aa][\.]?[Mm][\.]? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%normDay(group(3)):00",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r6b",EXTRACTION="(%reApproximate )?%reTimeHour[\s]*[Pp][\.]?[Mm][\.]? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%SUM%(%normDay(group(3)),12):00",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r6c",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute[\s]*[Aa][\.]?[Mm][\.]? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(5))T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r6d",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute[\s]*[Pp][\.]?[Mm][\.]? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(5))T%SUM%(%normDay(group(3)),12):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r6e",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(5))T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="time_r6a",EXTRACTION="(?:%reApproximate )?%reTimeHour\s*[Aa]\.?[Mm]\.? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(3))T%normDay(group(2)):00",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r6b",EXTRACTION="(?:%reApproximate )?%reTimeHour\s*[Pp]\.?[Mm]\.? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(3))T%SUM%(%normDay(group(2)),12):00",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r6c",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute\s*[Aa]\.?[Mm]\.? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%normDay(group(2)):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r6d",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute\s*[Pp]\.?[Mm]\.? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%SUM%(%normDay(group(2)),12):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r6e",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%normDay(group(2)):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
 
 // time_r7
 // added 2015-03-18 (jannik)
-RULENAME="time_r7a",EXTRACTION="(%reApproximate )?%reTimeHour%reTimeMinute %reTimezone",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r7b",EXTRACTION="((%reApproximate )?%reTimeHour%reTimeMinute %reTimezone) on %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-year-%normMonth(group(7))-%normDay(group(8))T%normDay(group(4)):group(5)",NORM_MOD="%normApprox4Dates(group(3))",OFFSET="group(1)-group(1)"
+RULENAME="time_r7a",EXTRACTION="(%reApproximate )?%reTimeHour:?%reTimeMinute %reTimezone",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="time_r7b",EXTRACTION="((%reApproximate )?%reTimeHour:?%reTimeMinute %reTimezone) on %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-year-%normMonth(group(7))-%normDay(group(8))T%normDay(group(4)):group(5)",NORM_MOD="%normApprox4Dates(group(3))",OFFSET="group(1)-group(1)"
 
 // time_r8
 // EXAMPLE r8a: the morning of April 18, 1775
-// EXAMPLE r8c: the morning of April 18
-RULENAME="time_r8a",EXTRACTION="([Tt]he )?%rePartOfDay of (%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)[\s]?,? %reYear4Digit(, %reWeekday)?",NORM_VALUE="group(9)-%normMonth(group(3))-%normDay(group(6))T%normPartOfDay(group(2))"
-RULENAME="time_r8b",EXTRACTION="([Tt]he )?%rePartOfDay of (%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(3))-%normDay(group(6))T%normPartOfDay(group(2))"
-RULENAME="time_r8c",EXTRACTION="([Tt]he )?%rePartOfDay of (the )?(%reDayWordTh|%reDayNumberTh|%reDayNumber)( of | )(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(12)-%normMonth(group(9))-%normDay(group(4))T%normPartOfDay(group(2))"
-RULENAME="time_r8d",EXTRACTION="([Tt]he )?%rePartOfDay of (the )?(%reDayWordTh|%reDayNumberTh|%reDayNumber)( of | )(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(9))-%normDay(group(4)))T%normPartOfDay(group(2))"
+// EXAMPLE r8b: the morning of April 18
+RULENAME="time_r8a",EXTRACTION="(?:[Tt]he )?%rePartOfDay of %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)\s?,? %reYear4Digit(?:, %reWeekday)?",NORM_VALUE="group(4)-%normMonth(group(2))-%normDay(group(3))T%normPartOfDay(group(1))"
+RULENAME="time_r8b",EXTRACTION="(?:[Tt]he )?%rePartOfDay of %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(3))T%normPartOfDay(group(1))"
+RULENAME="time_r8c",EXTRACTION="(?:[Tt]he )?%rePartOfDay of (?:the )?%(reDayNumber|reDayNumberTh|reDayWordTh) (?:of )?%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(3))-%normDay(group(2))T%normPartOfDay(group(1))"
+RULENAME="time_r8d",EXTRACTION="(?:[Tt]he )?%rePartOfDay of (?:the )?%(reDayNumber|reDayNumberTh|reDayWordTh) (?:of )?%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))-%normDay(group(2))T%normPartOfDay(group(1))"
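The time_r5 rules above normalize a.m./p.m. clock times: `%normDay(...)` doubles as a zero-padder for the hour, and `%SUM%(...,12)` shifts p.m. hours into 24-hour time. A simplified Java sketch of that arithmetic; the actual anchoring of `UNDEF-REF-day` to a reference date happens later in HeidelTime's normalization stage:

```java
// Sketch of the a.m./p.m. arithmetic in time_r5a-r5h (simplified stand-in,
// not HeidelTime's real API).
public class AmPmNorm {
    static String normHour(String h, boolean pm) {
        int hour = Integer.parseInt(h);
        if (pm) hour += 12;                    // %SUM%(%normDay(group(2)),12)
        return String.format("%02d", hour);    // %normDay also zero-pads: "9" -> "09"
    }

    public static void main(String[] args) {
        System.out.println(normHour("9", false) + ":00");  // 09:00 for "9 a.m."
        System.out.println(normHour("9", true) + ":00");   // 21:00 for "9 p.m."
    }
}
```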
diff --git a/resources/german/normalization/resources_normalization_normMonth.txt b/resources/german/normalization/resources_normalization_normMonth.txt
index 028420ee..602e6e7f 100644
--- a/resources/german/normalization/resources_normalization_normMonth.txt
+++ b/resources/german/normalization/resources_normalization_normMonth.txt
@@ -1,89 +1,61 @@
 // author: Jannik Strötgen
-// email: stroetgen@uni-hd.de
+// email: stroetgen@uni-hd\.de
 // date: 2011-06-10
 // This file contains "month words" and their normalized expressions
-// according to TIMEX3 format.
+// according to TIMEX3 format\.
 // For example, the normalized value of "2" is "02"
 // FORMAT: "month-word","normalized-month-word"
 // Full month names (including historic writings)
 // January
-"[Jj]anuar","01"
+"[Jj]anuar(ii|y|)","01"
 "[Jj]änner","01"
-"[Jj]anuarii","01"
-"[Ff]ebruar","02"
-"[Ff]ebruar(ii)?","02"
+"[Ff]ebruar(ii|y|)","02"
 "[Hh]ornung","02"
 "[Mm]ärz","03"
+"[Mm]arch","03"
 "[Mm]aerz","03"
 "[Mm]art(ii)?","03"
 "[Mm]erz","03"
-"[Aa]pril","04"
-"[Aa]prilis","04"
-"[Mm]ai","05"
-"[Mm]ai(i)?","05"
+"[Aa]pril(is)?","04"
+"[Mm]aii?","05"
 "[Mm]ay","05"
-"[Jj]uni","06"
-"[Jj]unii","06"
-"[Jj]uli","07"
-"[Jj]ulii","07"
-"[Aa]ugust","08"
-"[Aa]ugusti","08"
+"[Jj]un(ii?|y)","06"
+"[Jj]ul(ii?|y)","07"
+"[Aa]ugusti?","08"
 "[Ss]eptember","09"
-"[Ss]eptembr(is|.)?","09"
-"[Oo]ktober","10"
-"[Oo]ctober","10"
-"[Oo]ctobr(is|.)","10"
+"[Ss]eptembr(is|\.)?","09"
+"[Oo][ck]tober","10"
+"[Oo]ctobr(is|\.)?","10"
 "[Nn]ovember","11"
-"[Nn]ovembr(is|.)","11"
-"[Dd]ezember","12"
-"[Dd]ecember","12"
-"[Dd]ecembr(is|.)","12"
+"[Nn]ovembr(is|\.)?","11"
+"[Dd]e[cz]ember","12"
+"[Dd]ecembr(is|\.)?","12"
 // Abbreviated month names
-"[Jj]an","01"
-"[Jj]an.","01"
-"[Ff]eb","02"
-"[Ff]eb.","02"
-"[Mm]är","03"
-"[Mm]är.","03"
-"[Aa]pr","04"
-"[Aa]pr.","04"
-"[Mm]ai.","05"
-"[Jj]un","06"
-"[Jj]un.","06"
-"[Jj]ul","07"
-"[Jj]ul.","07"
-"[Aa]ug","08"
-"[Aa]ug.","08"
-"[Ss]ept?","09"
-"[Ss]ept?.","09"
-"[Oo]kt","10"
-"[Oo]kt.","10"
-"[Nn]ov","11"
-"[Nn]ov.","11"
-"[Dd]ez","12"
-"[Dd]ez.","12"
+"[Jj]an\.?","01"
+"[Ff]eb\.?","02"
+"[Mm][aä]r\.?","03"
+"[Aa]pr\.?","04"
+"[Mm]a[iy]","05"
+"[Jj]un\.?","06"
+"[Jj]ul\.?","07"
+"[Aa]ug\.?","08"
+"[Ss]ept?\.?","09"
+"[Oo][ck]t\.?","10"
+"[Nn]ov\.?","11"
+"[Dd]e[cz]\.?","12"
 // numbers
-"1","01"
-"01","01"
-"2","02"
-"02","02"
-"3","03"
-"03","03"
-"4","04"
-"04","04"
-"5","05"
-"05","05"
-"6","06"
-"06","06"
-"7","07"
-"07","07"
-"8","08"
-"08","08"
-"9","09"
-"09","09"
-"10","10"
-"11","11"
-"12","12"
+"0?1\.?","01"
+"0?2\.?","02"
+"0?3\.?","03"
+"0?4\.?","04"
+"0?5\.?","05"
+"0?6\.?","06"
+"0?7\.?","07"
+"0?8\.?","08"
+"0?9\.?","09"
+"10\.?","10"
+"11\.?","11"
+"12\.?","12"
diff --git a/resources/german/normalization/resources_normalization_normMonthToEnglish.txt b/resources/german/normalization/resources_normalization_normMonthToEnglish.txt
index 54c5d2f0..30ccc60e 100644
--- a/resources/german/normalization/resources_normalization_normMonthToEnglish.txt
+++ b/resources/german/normalization/resources_normalization_normMonthToEnglish.txt
@@ -1,78 +1,53 @@
 // author: Jannik Strötgen
-// email: stroetgen@uni-hd.de
+// email: stroetgen@uni-hd\.de
 // date: 2011-09-13
 // This file contains "month words" and their English expressions
-// according to TIMEX3 format.
+// according to TIMEX3 format\.
 // For example, the normalized value of "2" is "february"
 // FORMAT: "month-word","normalized-month-word"
-
-
 // January
-"[Jj]an","january"
-"[Jj]an.","january"
-"[Jj]anuar","january"
-"[Jj]anuarii","january"
+"[Jj]an\.?","january"
+"[Jj]anuar(ii|y|)","january"
 "[Jj]änner","january"
 // February
-"[Ff]eb","february"
-"[Ff]eb.","february"
-"[Ff]ebruar","february"
-"[Ff]ebruarii","february"
+"[Ff]eb\.?","february"
+"[Ff]ebruar(ii|y|)","february"
 "[Hh]ornung","february"
 // March
-"[Mm]är","march"
-"[Mm]är.","march"
+"[Mm]är[z.]?","march"
+"[Mm]arch","march"
 "[Mm]aerz","march"
-"[Mm]ärz","march"
 "[Mm]art(ii)?","march"
 "[Mm]erz","march"
 // April
-"[Aa]pr","april"
-"[Aa]pr.","april"
-"[Aa]pril","april"
-"[Aa]prilis","april"
+"[Aa]pr\.?","april"
+"[Aa]pril(is)?","april"
 // May
-"[Mm]ai","may"
-"[Mm]ai.","may"
-"[Mm]aii","may"
-"[Mm]ay","may"
+"[Mm]a(ii?|y)\.?","may"
 // June
-"[Jj]un","june"
-"[Jj]un.","june"
-"[Jj]uni","june"
-"[Jj]unii","june"
+"[Jj]un\.?","june"
+"[Jj]un(ii?|y)","june"
 // July
-"[Jj]ul","july"
-"[Jj]ul.","july"
-"[Jj]uli","july"
-"[Jj]ulii","july"
+"[Jj]ul\.?","july"
+"[Jj]ul(ii?|y)","july"
 // August
-"[Aa]ug","august"
-"[Aa]ug.","august"
+"[Aa]ug\.?","august"
 "[Aa]ugust","august"
-"[Aa]ugusti","august"
+"[Aa]ugusti?","august"
 // September
-"[Ss]ept?","september"
-"[Ss]ept?.","september"
-"Sept","september"
-"sept","september"
+"[Ss]ept?\.?","september"
 "[Ss]eptember","september"
-"[Ss]eptembr(is|.)?","september"
+"[Ss]eptembr(is|\.)?","september"
 // October
-"[Oo]kt","october"
-"[Oo]kt.","october"
-"[Oo]ktober","october"
-"[Oo]ctober","october"
-"[Oo]ctobr(is|.)","october"
+"[Oo][ck]t\.?","october"
+"[Oo][ck]tober","october"
+"[Oo]ctobr(is|\.)?","october"
 // November
-"[Nn]ov","november"
-"[Nn]ov.","november"
+"[Nn]ov\.?","november"
 "[Nn]ovember","november"
-"[Nn]ovembr(is|.)","november"
+"[Nn]ovembr(is|\.)?","november"
 // December
-"[Dd]ez","december"
-"[Dd]ez.","december"
-"[Dd]ezember","december"
-"[Dd]ecember","december"
-"[Dd]ecembr(is|.)","december"
+"[Dd]e[cz]\.?","december"
+"[Dd]e[cz]ember","december"
+"[Dd]ecembr(is|\.)?","december"
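Both German month resources now fold the historic and English spellings into single alternations, using an empty branch such as `(ii|y|)` to make the suffix optional while still capturing it. A quick Java check that a collapsed pattern accepts the same variants the old one-entry-per-spelling list did:

```java
import java.util.regex.Pattern;

// Verify the collapsed month alternation "[Jj]anuar(ii|y|)" covers
// Januar, januar, Januarii (historic), and January (English).
public class MonthVariants {
    public static void main(String[] args) {
        Pattern january = Pattern.compile("[Jj]anuar(ii|y|)");
        for (String s : new String[] {"Januar", "januar", "Januarii", "January"}) {
            System.out.println(s + " -> " + january.matcher(s).matches());
        }
    }
}
```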
-"Nachmittags","AF" -"abend","EV" -"Abend","EV" -"abends","EV" -"Abends","EV" +"[Mm]ittags?(?:stunden|)","12:00" +"[Nn]achmittags?","AF" +"[Aa]bend(?:stunden|s|)","EV" // night -"nacht","NI" -"Nacht","NI" -"nachts","NI" -"Nachts","NI" -"mitternacht","24:00" -"Mitternacht","24:00" -"mitternachts","24:00" -"Mitternachts","24:00" +"[Nn]acht(?:stunden|s|)","NI" +"[Mm]itternachts?","24:00" diff --git a/resources/german/normalization/resources_normalization_normThisNextLast.txt b/resources/german/normalization/resources_normalization_normThisNextLast.txt index c8bebd50..9ce6f76e 100644 --- a/resources/german/normalization/resources_normalization_normThisNextLast.txt +++ b/resources/german/normalization/resources_normalization_normThisNextLast.txt @@ -3,93 +3,13 @@ // date: 2011-06-10 // This file contains "this/next/last words" and their normalized expressions. // FORMAT: "this/next/last-word","normalized-this/next/last-word" -"letzte","last" -"letzter","last" -"letztes","last" -"letzten","last" -"letztem","last" -"Letzte","last" -"Letzter","last" -"Letztes","last" -"Letzten","last" -"Letztem","last" -"nächste","next" -"nächster","next" -"nächstes","next" -"nächsten","next" -"nächstem","next" -"Nächste","next" -"Nächster","next" -"Nächstes","next" -"Nächsten","next" -"Nächstem","next" -"folgende","next" -"folgender","next" -"folgendes","next" -"folgenden","next" -"folgendem","next" -"Folgende","next" -"Folgender","next" -"Folgendes","next" -"Folgenden","next" -"Folgendem","next" -"vorige","last" -"voriger","last" -"voriges","last" -"vorigen","last" -"vorigem","last" -"Vorige","last" -"Voriger","last" -"Voriges","last" -"Vorigen","last" -"Vorigem","last" -"diese","this" -"dieser","this" -"dieses","this" -"diesen","this" -"diesem","this" -"Diese","this" -"Dieser","this" -"Dieses","this" -"Diesen","this" -"Diesem","this" -"Selbe","this" -"Selber","this" -"Selbes","this" -"Selben","this" -"Selbem","this" -"selbe","this" -"selber","this" -"selbes","this" -"selben","this" -"selbem","this" -"Gleiche","this" -"Gleicher","this" -"Gleiches","this" -"Gleichen","this" -"Gleichem","this" -"gleiche","this" -"gleicher","this" -"gleiches","this" -"gleichen","this" -"gleichem","this" -"Kommende","next" -"Kommender","next" -"Kommendes","next" -"Kommenden","next" -"Kommendem","next" -"kommende","next" -"kommender","next" -"kommendes","next" -"kommenden","next" -"kommendem","next" -"Vergangene","last" -"Vergangener","last" -"Vergangenes","last" -"Vergangenen","last" -"Vergangenem","last" -"vergangene","last" -"vergangener","last" -"vergangenes","last" -"vergangenen","last" -"vergangenem","last" +"[Ll]etzte[rsnm]?","last" +"[Nn]ächste[rsnm]?","next" +"[Nn]eue[rsnm]?","next" +"[Ff]olgende[rsnm]?","next" +"[Vv]orige[rsnm]?","last" +"[Dd]iese[rsnm]?","this" +"[Ss]elbe[rsnm]?","this" +"[Gg]leiche[rsnm]?","this" +"[Kk]ommende[rsnm]?","next" +"[Vv]ergangene[rsnm]?","last" diff --git a/resources/german/normalization/resources_normalization_normUnit.txt b/resources/german/normalization/resources_normalization_normUnit.txt index 82a88e9c..1ed165da 100644 --- a/resources/german/normalization/resources_normalization_normUnit.txt +++ b/resources/german/normalization/resources_normalization_normUnit.txt @@ -5,60 +5,17 @@ // according to TIMEX3 format. 
// For example, the normalized value of "Woche" is "week" // FORMAT: "unit-word","normalized-unit-word" -"Handelstag","day" -"Handelstage","day" -"Handelstagen","day" -"Tag","day" -"Tage","day" -"Tagen","day" -"Wochenende","week-WE" -"Wochenenden","week-WE" -"Woche","week" -"Wochen","week" -"Monat","month" -"Monate","month" -"Monaten","month" -"Quartal","quarter" -"Quartale","quarter" -"Quartalen","quarter" -"Jahr","year" -"Jahre","year" -"Jahren","year" -"Jahrzehnt","decade" -"Jahrzehnte","decade" -"Jahrzehnten","decade" -"Jahrhundert","century" -"Jahrhunderte","century" -"Jahrhunderten","century" -"Jh\.","century" -// LOWERCASE -"handelstag","day" -"handelstage","day" -"handelstagen","day" -"tag","day" -"tage","day" -"tagen","day" -"wochenende","week-WE" -"wochenenden","week-WE" -"woche","week" -"wochen","week" -"monat","month" -"monate","month" -"monaten","month" -"quartal","quarter" -"quartale","quarter" -"quartalen","quarter" -"jahr","year" -"jahre","year" -"jahren","year" -"jahrzehnt","decade" -"jahrzehnte","decade" -"jahrzehnten","decade" -"jahrhundert","century" -"jahrhunderte","century" -"jahrhunderten","century" -// not in reUnit -"Stunde","hour" -"Stunden","hour" -"Minute","minute" -"Minuten","minute" +"[Hh]andelstage?n?","day" +"[Tt]age?n?","day" +"[Ww]ochenenden?","week-WE" +"[Ww]ochen?","week" +"[Mm]onate?n?","month" +"[Qq]uartale?n?","quarter" +"[Jj]ahre?n?","year" +"[Jj]ahrzehnte?n?","decade" +"[Jj]ahrhunderte?n?","century" +"[Jj]hd?t?\.","century" +// not in reUnit, but reUnitFine +"[Ss]tunden?","hour" +"[Mm]inuten?","minute" +"[Ss]ekunden?","second" diff --git a/resources/german/normalization/resources_normalization_normUnit4Duration.txt b/resources/german/normalization/resources_normalization_normUnit4Duration.txt index 2c87d67f..56c1f11d 100644 --- a/resources/german/normalization/resources_normalization_normUnit4Duration.txt +++ b/resources/german/normalization/resources_normalization_normUnit4Duration.txt @@ -5,60 +5,17 @@ // according to TIMEX3 format. 
// For example, the normalized value of "Woche" is "WE" // FORMAT: "unit-word","normalized-unit-word" -"Handelstag","D" -"Handelstage","D" -"Handelstagen","D" -"Tag","D" -"Tage","D" -"Tagen","D" -"Wochenende","WE" -"Wochenenden","WE" -"Woche","W" -"Wochen","W" -"Monat","M" -"Monate","M" -"Monaten","M" -"Quartal","Q" -"Quartale","Q" -"Quartalen","Q" -"Jahr","Y" -"Jahre","Y" -"Jahren","Y" -"Jahrzehnt","DE" -"Jahrzehnte","DE" -"Jahrzehnten","DE" -"Jahrhundert","CE" -"Jahrhunderte","CE" -"Jahrhunderten","CE" -"Jh\.","CE" -// LOWERCASE -"handelstag","D" -"handelstage","D" -"handelstagen","D" -"tag","D" -"tage","D" -"tagen","D" -"wochenende","WE" -"wochenenden","WE" -"woche","W" -"wochen","W" -"monat","M" -"monate","M" -"monaten","M" -"quartal","Q" -"quartale","Q" -"quartalen","Q" -"jahr","Y" -"jahre","Y" -"jahren","Y" -"jahrzehnt","DE" -"jahrzehnte","DE" -"jahrzehnten","DE" -"jahrhundert","CE" -"jahrhunderte","CE" -"jahrhunderten","CE" +"[Hh]andelstage?n?","D" +"[Tt]age?n?","D" +"[Ww]ochenenden?","WE" +"[Ww]ochen?","W" +"[Mm]onate?n?","M" +"[Qq]uartale?n?","Q" +"[Jj]ahre?n?","Y" +"[Jj]ahrzehnte?n?","DE" +"[Jj]ahrhunderte?n?","CE" +"[Jj]hd?t?\.","CE" // not in reUnit -"Stunde","H" -"Stunden","H" -"Minute","M" -"Minuten","M" +"[Ss]tunden?","H" +"[Mm]inuten?","M" +"[Ss]ekunden?","S" diff --git a/resources/german/repattern/resources_repattern_reAndOrTo.txt b/resources/german/repattern/resources_repattern_reAndOrTo.txt index 4e1c8fb8..9b764911 100644 --- a/resources/german/repattern/resources_repattern_reAndOrTo.txt +++ b/resources/german/repattern/resources_repattern_reAndOrTo.txt @@ -3,10 +3,5 @@ // date: 2013-10-17 // This file contains regular expression patterns for "and", "or", "to" words. // FORMAT: one line is one disjunction of the pattern -[\s]?\–[\s]? -[\s]?-[\s]? -[\s]?–[\s]? -[\s]?/[\s]? - und (zum )? - bis (zum )? - oder (zum)? +\s?[/–‒‑-]\s? + (?:und|bis|oder)(?:zum | ) diff --git a/resources/german/repattern/resources_repattern_reApproximate.txt b/resources/german/repattern/resources_repattern_reApproximate.txt index 628a90a5..eb881b3e 100644 --- a/resources/german/repattern/resources_repattern_reApproximate.txt +++ b/resources/german/repattern/resources_repattern_reApproximate.txt @@ -7,6 +7,9 @@ [Ee]twa [Uu]ngefähr [Cc]irca +[Gg]egen +[Ff]rüh(?:en) +[Ss]pät(?:en) // less [Nn]icht mehr als [Nn]icht länger als diff --git a/resources/german/repattern/resources_repattern_reDayNumberTh.txt b/resources/german/repattern/resources_repattern_reDayNumberTh.txt index d86a9cbb..1b2f4721 100644 --- a/resources/german/repattern/resources_repattern_reDayNumberTh.txt +++ b/resources/german/repattern/resources_repattern_reDayNumberTh.txt @@ -3,4 +3,8 @@ // date: 2011-06-10 // This file contains regular expression patterns for day digit th. // FORMAT: one line is one disjunction of the pattern -DUMMY-FOR-NUMBERTH \ No newline at end of file +0[0-9]\. +[1-9]\. +1[0-9]\. +2[0-9]\. +3[01]\. diff --git a/resources/german/repattern/resources_repattern_reHolidayFix.txt b/resources/german/repattern/resources_repattern_reHolidayFix.txt index 4796cf48..4aa30026 100644 --- a/resources/german/repattern/resources_repattern_reHolidayFix.txt +++ b/resources/german/repattern/resources_repattern_reHolidayFix.txt @@ -10,7 +10,7 @@ Neujahrstag Neujahrsfest // http://de.wikipedia.org/wiki/Heilige_Drei_K%C3%B6nige -[Hh]eilig(en?)? [Dd]rei König(en?)? +[Hh]eilig(?:en?|) [Dd]rei König(?:en?|) Epiphanias Epiphanie Erscheinung des Herrn @@ -49,7 +49,7 @@ Allerheiligen Weihnachten Weihnachtsabend Heiligabend -[Hh]eilig(en)? 
Abend +[Hh]eilig(?:en|) Abend Weihnacht [Hh]eiliges Christfest Christfest diff --git a/resources/german/repattern/resources_repattern_reMonthLong.txt b/resources/german/repattern/resources_repattern_reMonthLong.txt index b62ee369..1478c099 100644 --- a/resources/german/repattern/resources_repattern_reMonthLong.txt +++ b/resources/german/repattern/resources_repattern_reMonthLong.txt @@ -3,27 +3,26 @@ // date: 2011-06-10 // This file contains regular expression patterns for long months. // FORMAT: one line is one disjunction of the pattern -[Jj]anuar(ii)? +[Jj]anuar(?:ii|y|) [Jj]änner -[Ff]ebruar(ii)? +[Ff]ebruar(?:ii|y|) [Hh]ornung [Mm]ärz [Mm]aerz +[Mm]arch [Mm]erz -[Mm]art(ii)? -[Aa]pril(is)? -[Mm]ai(i)? +[Mm]art(?:ii)? +[Aa]pril(?:is)? +[Mm]aii? [Mm]ay -[Jj]uni(i)? -[Jj]uli(i)? -[Aa]ugust(i)? +[Jj]un(?:ii?|y) +[Jj]ul(?:ii?|y) +[Aa]ugusti? [Ss]eptember -[Ss]eptembr(is|.)? -[Oo]ktober -[Oo]ctober -[Oo]ctobr(is|.) +[Ss]eptembr(?:is|\.)? +[Oo][ck]tober +[Oo]ctobr(?:is|\.)? [Nn]ovember -[Nn]ovembr(is|.) -[Dd]ezember -[Dd]ecember -[Dd]ecembr(is|.) +[Nn]ovembr(?:is|\.)? +[Dd]e[cz]ember +[Dd]ecembr(?:is|\.)? diff --git a/resources/german/repattern/resources_repattern_reMonthShort.txt b/resources/german/repattern/resources_repattern_reMonthShort.txt index f0000eac..399809af 100644 --- a/resources/german/repattern/resources_repattern_reMonthShort.txt +++ b/resources/german/repattern/resources_repattern_reMonthShort.txt @@ -7,11 +7,11 @@ [Jj]an [Ff]eb\. [Ff]eb -[Mm]är\. -[Mm]är +[Mm][aä]r\. +[Mm][aä]r [Aa]pr\. [Aa]pr -[Mm]ai +[Mm]a[iy] [Jj]un\. [Jj]un [Jj]ul\. @@ -22,9 +22,9 @@ [Ss]ep [Ss]ept\. [Ss]ept -[Oo]kt\. -[Oo]kt +[Oo][ck]t\. +[Oo][ck]t [Nn]ov\. [Nn]ov -[Dd]ez\. -[Dd]ez \ No newline at end of file +[Dd]e[cz]\. +[Dd]e[cz] diff --git a/resources/german/repattern/resources_repattern_rePartOfDay.txt b/resources/german/repattern/resources_repattern_rePartOfDay.txt index 8d485d0e..5722da23 100644 --- a/resources/german/repattern/resources_repattern_rePartOfDay.txt +++ b/resources/german/repattern/resources_repattern_rePartOfDay.txt @@ -5,8 +5,8 @@ // FORMAT: one line is one disjunction of the pattern [Vv]ormittag [Nn]achmittag -[Mm]ittag +[Mm]ittag(?:sstunden|) [Mm]itternacht -[Nn]acht -[Mm]orgen -[Aa]bend \ No newline at end of file +[Nn]acht(?:stunden|) +[Mm]orgen(?:stunden|) +[Aa]bend(?:stunden|) diff --git a/resources/german/repattern/resources_repattern_rePartOfYear.txt b/resources/german/repattern/resources_repattern_rePartOfYear.txt index 2ccdd1da..77501b46 100644 --- a/resources/german/repattern/resources_repattern_rePartOfYear.txt +++ b/resources/german/repattern/resources_repattern_rePartOfYear.txt @@ -9,15 +9,12 @@ [Dd]ritte[ns]? Quartals? [Vv]ierte[ns]? Quartals? [Ll]etzte[ns]? Quartals? -1\. Quartals? -2\. Quartals? -3\. Quartals? -4\. Quartals? +[1-4]\. Quartals? // Jahreshälften [Ee]rste[ns]? Hälfte [Zz]weite[ns]? Hälfte [Ll]etzte[ns]? Hälfte -[Ee]rste[ns]? Halbjahr(es)? -[Zz]weite[ns]? Halbjahr(es)? +[Ee]rste[ns]? Halbjahr(?:es|) +[Zz]weite[ns]? Halbjahr(?:es|) [Ee]rste[ns]? Jahreshälfte [Zz]weite[ns]? Jahreshälfte \ No newline at end of file diff --git a/resources/german/repattern/resources_repattern_reThisNextLast.txt b/resources/german/repattern/resources_repattern_reThisNextLast.txt index 03402285..3e8f1019 100644 --- a/resources/german/repattern/resources_repattern_reThisNextLast.txt +++ b/resources/german/repattern/resources_repattern_reThisNextLast.txt @@ -5,6 +5,7 @@ // FORMAT: one line is one disjunction of the pattern [Ll]etzte[rsnm]? [Nn]ächste[rsnm]? +[Nn]eue[rsnm]? 
 [Ff]olgende[rsnm]?
 [Dd]iese[rsnm]?
 [Vv]orige[rsnm]?
diff --git a/resources/german/repattern/resources_repattern_reTimeHour.txt b/resources/german/repattern/resources_repattern_reTimeHour.txt
index 9e5d33a6..ae842ddd 100644
--- a/resources/german/repattern/resources_repattern_reTimeHour.txt
+++ b/resources/german/repattern/resources_repattern_reTimeHour.txt
@@ -3,28 +3,6 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for time hours.
 // FORMAT: one line is one disjunction of the pattern
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20
-21
-22
-23
-24
-1
-2
-3
-4
-5
-6
-7
-8
-9
-0
\ No newline at end of file
+[01][0-9]?
+2[0-4]?
+[3-9]
diff --git a/resources/german/repattern/resources_repattern_reTimeMinute.txt b/resources/german/repattern/resources_repattern_reTimeMinute.txt
index 6e560f7a..15189854 100644
--- a/resources/german/repattern/resources_repattern_reTimeMinute.txt
+++ b/resources/german/repattern/resources_repattern_reTimeMinute.txt
@@ -3,4 +3,4 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for time minutes.
 // FORMAT: one line is one disjunction of the pattern
-[0|1|2|3|4|5][0-9]
\ No newline at end of file
+[0-5][0-9]
diff --git a/resources/german/repattern/resources_repattern_reUnit.txt b/resources/german/repattern/resources_repattern_reUnit.txt
index e4444820..238d911a 100644
--- a/resources/german/repattern/resources_repattern_reUnit.txt
+++ b/resources/german/repattern/resources_repattern_reUnit.txt
@@ -3,29 +3,15 @@
 // date: 2011-06-10
 // This file contains regular expression patterns for unit words.
 // FORMAT: one line is one disjunction of the pattern
-[Hh]andelstagen
-[Hh]andelstage
-[Hh]andelstag
-[Tt]agen
-[Tt]age
-[Tt]ag
-[Ww]ochenenden
-[Ww]ochenende
-[Ww]ochen
-[Ww]oche
-[Mm]onaten
-[Mm]onate
-[Mm]onat
-[Qq]uartalen
-[Qq]uartale
-[Qq]uartal
-[Jj]ahren
-[Jj]ahre
-[Jj]ahr
-[Jj]ahrzehnten
-[Jj]ahrzehnte
-[Jj]ahrzehnt
-[Jj]ahrhunderten
-[Jj]ahrhunderte
-[Jj]ahrhundert
-Jh\.
\ No newline at end of file
+[Hh]andelstag(?:en|e|)
+[Tt]ag(?:en|e|)
+[Ww]ochenenden?
+[Ww]ochen?
+[Mm]onat(?:en|e|)
+[Qq]uartal(?:en|e|)
+[Jj]ahr(?:en|e|)
+[Jj]ahrzehnt(?:en|e|)
+[Jj]ahrhundert(?:en|e|)
+Jh\.
+Jhd\.
+Jhdt\.
diff --git a/resources/german/repattern/resources_repattern_reUnitFine.txt b/resources/german/repattern/resources_repattern_reUnitFine.txt
new file mode 100644
index 00000000..b1376915
--- /dev/null
+++ b/resources/german/repattern/resources_repattern_reUnitFine.txt
@@ -0,0 +1,20 @@
+// author: Jannik Strötgen
+// email: stroetgen@uni-hd.de
+// date: 2011-06-10
+// This file contains regular expression patterns for unit words.
+// FORMAT: one line is one disjunction of the pattern
+[Ss]tunden?
+[Mm]inuten?
+[Ss]ekunden?
+[Hh]andelstag(?:en|e|)
+[Tt]ag(?:en|e|)
+[Ww]ochenenden?
+[Ww]ochen?
+[Mm]onat(?:en|e|)
+[Qq]uartal(?:en|e|)
+[Jj]ahr(?:en|e|)
+[Jj]ahrzehnt(?:en|e|)
+[Jj]ahrhundert(?:en|e|)
+Jh\.
+Jhd\.
+Jhdt\.
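The three new reTimeHour disjuncts above replace the 25 literal alternatives. Since a repattern file's lines form one big alternation, the combined pattern should accept exactly 0 through 24, with an optional leading zero up to 09, and reject everything else. A small exhaustive Java check, assuming the lines are joined with a plain `|`:

```java
import java.util.regex.Pattern;

// Exhaustively verify the rewritten reTimeHour disjunction over 0-99.
public class TimeHourCoverage {
    public static void main(String[] args) {
        Pattern hour = Pattern.compile("[01][0-9]?|2[0-4]?|[3-9]");
        for (int i = 0; i <= 99; i++) {
            boolean plain = hour.matcher(Integer.toString(i)).matches();
            boolean padded = hour.matcher(String.format("%02d", i)).matches();
            if ((i <= 24) != plain) System.out.println("unexpected for " + i);
            if (i <= 9 && !padded) System.out.println("missing padded 0" + i);
        }
        System.out.println("done");  // no output above means full coverage
    }
}
```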
diff --git a/resources/german/repattern/resources_repattern_reYearPrefix.txt b/resources/german/repattern/resources_repattern_reYearPrefix.txt
index 6afacbb3..26737e19 100644
--- a/resources/german/repattern/resources_repattern_reYearPrefix.txt
+++ b/resources/german/repattern/resources_repattern_reYearPrefix.txt
@@ -4,14 +4,14 @@
 // This file contains regular expression patterns for year numbers (4 digits).
 // FORMAT: one line is one disjunction of the pattern
 BC
-B[\.]C[\.]
-B[\.]C
+B\.C\.
+B\.C
 AD
-A[\.]D[\.]
-A[\.]D
-v[\.] Chr[\.]
-n[\.] Chr[\.]
-vor Chr[\.]
-nach Chr[\.]
+A\.D\.
+A\.D
+v\. Chr\.
+n\. Chr\.
+vor Chr\.
+nach Chr\.
 vor Christus
 nach Christus
\ No newline at end of file
diff --git a/resources/german/rules/resources_rules_daterules.txt b/resources/german/rules/resources_rules_daterules.txt
index 17c859df..d2c9a6aa 100644
--- a/resources/german/rules/resources_rules_daterules.txt
+++ b/resources/german/rules/resources_rules_daterules.txt
@@ -15,12 +15,12 @@
 // EXAMPLE date_historic_1d-BCADhint: Anfang 190 v. Chr. (1- to 4-digit year)
 // EXAMPLE date_historic_1e-BCADhint: Anfang v. Chr. 190 v. Chr. (1- to 4-digit year)
 // EXAMPLE date_historic_1f-BCADhint: Anfang 190 bis 180 v. Chr. (find "Anfang 190 v. Chr."; 1- to 4-digit year)
-RULENAME="date_historic_1a-BCADhint",EXTRACTION="(%reApproximate )?(Jahr(e)?([ns])? )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))"
-RULENAME="date_historic_1b-BCADhint",EXTRACTION="(%reApproximate )?(Jahr(e)?([ns])? )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(6))%normYearBC(group(7))"
-RULENAME="date_historic_1c-BCADhint",EXTRACTION="(%reApproximate )?(Jahr(e)?([ns])? )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(9))%normYearBC(group(6))",OFFSET="group(0)-group(6)"
-RULENAME="date_historic_1d-BCADhint",EXTRACTION="(%rePartWords )(Jahr(e)?([ns])? )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))"
-RULENAME="date_historic_1e-BCADhint",EXTRACTION="(%rePartWords )(Jahr(e)?([ns])? )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(6))%normYearBC(group(7))"
-RULENAME="date_historic_1f-BCADhint",EXTRACTION="(%rePartWords )(Jahr(e)?([ns])? )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(9))%normYearBC(group(6))",OFFSET="group(0)-group(6)"
+RULENAME="date_historic_1a-BCADhint",EXTRACTION="(?:%reApproximate |)(?:Jahre?[ns]? )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%normYearBC(group(2))"
+RULENAME="date_historic_1b-BCADhint",EXTRACTION="(?:%reApproximate |)(?:Jahre?[ns]? )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(2))%normYearBC(group(3))"
+RULENAME="date_historic_1c-BCADhint",EXTRACTION="(?:%reApproximate |)(?:Jahre?[ns]? )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(2))",OFFSET="group(0)-group(2)"
+RULENAME="date_historic_1d-BCADhint",EXTRACTION="(?:%rePartWords )(?:Jahre?[ns]? )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%normYearBC(group(2))"
+RULENAME="date_historic_1e-BCADhint",EXTRACTION="(?:%rePartWords )(?:Jahre?[ns]? )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(2))%normYearBC(group(3))"
+RULENAME="date_historic_1f-BCADhint",EXTRACTION="(?:%rePartWords )(?:Jahre?[ns]? )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(2))",OFFSET="group(0)-group(2)"
 
 // historic dates; month granularity
 // EXAMPLE date_historic_2a-BCADhint: Januar 190 v. Chr. (1- to 4-digit year)
@@ -29,44 +29,44 @@ RULENAME="date_historic_1f-BCADhint",EXTRACTION="(%rePartWords )(Jahr(e)?([ns])?
 // EXAMPLE date_historic_2d-BCADhint: Anfang Januar 190 v. Chr. (1- to 4-digit year)
 // EXAMPLE date_historic_2e: Anfang Januar 190 (3-digit year)
 // EXAMPLE date_historic_2f: Anfang Januar 90 (2-digit year)
-RULENAME="date_historic_2a-BCADhint",EXTRACTION="(%reApproximate )?(%reMonthLong|%reMonthShort)( )%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(8))%normYearBC(group(7))-%normMonth(group(3))"
-RULENAME="date_historic_2b",EXTRACTION="(%reApproximate )?(%reMonthLong|%reMonthShort)( )([\d][\d][\d])",NORM_VALUE="%normYearBC(group(7))-%normMonth(group(3))"
-RULENAME="date_historic_2c",EXTRACTION="(%reApproximate )?(%reMonthLong|%reMonthShort)( )([\d][\d])",NORM_VALUE="UNDEF-centurygroup(7)-%normMonth(group(3))"
-RULENAME="date_historic_2d-BCADhint",EXTRACTION="(%rePartWords )(%reMonthLong|%reMonthShort)( )%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(8))%normYearBC(group(7))-%normMonth(group(3))"
-RULENAME="date_historic_2e",EXTRACTION="(%rePartWords )(%reMonthLong|%reMonthShort)( )([\d][\d][\d])",NORM_VALUE="%normYearBC(group(7))-%normMonth(group(3))"
-RULENAME="date_historic_2f",EXTRACTION="(%rePartWords )(%reMonthLong|%reMonthShort)( )([\d][\d])",NORM_VALUE="UNDEF-centurygroup(7)-%normMonth(group(3))"
+RULENAME="date_historic_2a-BCADhint",EXTRACTION="(?:%reApproximate |)%(reMonthLong|reMonthShort) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))"
+RULENAME="date_historic_2b",EXTRACTION="(?:%reApproximate |)%(reMonthLong|reMonthShort) (\d\d\d)",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(2))"
+RULENAME="date_historic_2c",EXTRACTION="(?:%reApproximate |)%(reMonthLong|reMonthShort) (\d\d)",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))"
+RULENAME="date_historic_2d-BCADhint",EXTRACTION="(?:%rePartWords )%(reMonthLong|reMonthShort) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))"
+RULENAME="date_historic_2e",EXTRACTION="(?:%rePartWords )%(reMonthLong|reMonthShort) (\d\d\d)",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(2))"
+RULENAME="date_historic_2f",EXTRACTION="(?:%rePartWords )%(reMonthLong|reMonthShort) (\d\d)",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))"
 
 // historic dates; day granularity
 // EXAMPLE date_historic_3a-BCADhint: 1. Januar 190 v. Chr. (1- to 4-digit year)
 // EXAMPLE date_historic_3b: 1. Januar 190 (3-digit year)
 // EXAMPLE date_historic_3c: 1. Januar 90 (2-digit year)
 // EXAMPLE date_historic_3d: 1. - 15. Januar 90 (find "1. Januar 90"; 2-digit year)
-RULENAME="date_historic_3a-BCADhint",EXTRACTION="%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))-%normMonth(group(2))-%normDay(group(1))"
-RULENAME="date_historic_3b",EXTRACTION="%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? (\d\d\d)",NORM_VALUE="%normYearBC(group(6))-%normMonth(group(2))-%normDay(group(1))"
-RULENAME="date_historic_3c",EXTRACTION="%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(6)-%normMonth(group(2))-%normDay(group(1))"
-RULENAME="date_historic_3d",EXTRACTION="(%reDayNumber[\.]?)%reAndOrTo%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(9)-%normMonth(group(5))-%normDay(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="date_historic_3a-BCADhint",EXTRACTION="%reDayNumber\.? %(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3b",EXTRACTION="%reDayNumber\.? %(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) (\d\d\d)",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3c",EXTRACTION="%reDayNumber\.? %(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3d",EXTRACTION="(%reDayNumber\.?)%reAndOrTo%reDayNumber\.? %(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(6)-%normMonth(group(5))-%normDay(group(1))",OFFSET="group(1)-group(1)"
 
 // historic dates; season granularity
 // EXAMPLE date_historic_4a-BCADhint: Winter 190 v. Chr. (1- to 4-digit year)
 // EXAMPLE date_historic_4b-BCADhint: Mitte Winter 190 v.Chr. (1- to 4-digit year)
-RULENAME="date_historic_4a-BCADhint",EXTRACTION="(%reApproximate )?%reSeason %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(4))-%normSeason(group(3))",
-RULENAME="date_historic_4b-BCADhint",EXTRACTION="(%rePartWords )%reSeason %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(4))-%normSeason(group(3))",
+RULENAME="date_historic_4a-BCADhint",EXTRACTION="(?:%reApproximate |)%reSeason %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normSeason(group(2))",
+RULENAME="date_historic_4b-BCADhint",EXTRACTION="(?:%rePartWords )%reSeason %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normSeason(group(2))",
 
 // historic dates; century granularity
 // EXAMPLE date_historic_5a-BCADhint: das 5. Jahrhundert v. Chr.
-RULENAME="date_historic_5a-BCADhint",EXTRACTION="([Dd]as )?(%reDayNumber[\.]?|%reDayWordTh) (Jahrhundert[s]?|Jh\.) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(6))%normDay(%SUM%(%normDay(group(2)),-1))"
+RULENAME="date_historic_5a-BCADhint",EXTRACTION="(?:[Dd]as )?%(reDayNumber|reDayNumberTh|reDayWordTh) J(?:ahrhunderts?|h\.) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(2))%normDay(%SUM%(%normDay(group(1)),-1))"
 
 // historic dates; year granularity
 // EXAMPLE date_historic_6a: Jahr 90 (2-digit year)
 // EXAMPLE date_historic_6b: Jahr 190 (3-digit year)
-RULENAME="date_historic_6a",EXTRACTION="(Jahr(e)?([ns])?) (\d\d)",NORM_VALUE="%normYearBC(group(4))"
-RULENAME="date_historic_6b",EXTRACTION="(Jahr(e)?([ns])?) (\d\d\d)",NORM_VALUE="%normYearBC(group(4))"
+RULENAME="date_historic_6a",EXTRACTION="Jahre?[ns]? (\d\d)",NORM_VALUE="UNDEF-centurygroup(1)"
+RULENAME="date_historic_6b",EXTRACTION="Jahre?[ns]? 
(\d\d\d)",NORM_VALUE="%normYearBC(group(1))" // historic dates; negative rules // EXAMPLE date_historic_0a_negative: MiG-190 (1- to 4-digit year) // EXAMPLE date_historic_0b_negative: 90 Menschen (2-digit year) -RULENAME="date_historic_0a_negative",EXTRACTION="(MiG(-)%reYearBC)",NORM_VALUE="REMOVE" -RULENAME="date_historic_0b_negative",EXTRACTION="(%reYear2Digit )([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(3):NN:",OFFSET="group(1)-group(1)" +RULENAME="date_historic_0a_negative",EXTRACTION="MiG-%reYearBC",NORM_VALUE="REMOVE" +RULENAME="date_historic_0b_negative",EXTRACTION="%reYear2Digit (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NN:",OFFSET="group(1)-group(1)" //////////////////// // POSITIVE RULES // @@ -81,53 +81,53 @@ RULENAME="date_historic_0b_negative",EXTRACTION="(%reYear2Digit )([\S]+)",NORM_V // EXAMPLE r0c_1: 09/26/1999 // EXAMPLE r0d_1: 09/26/99 // EXAMPLE r0e_1: 7-14 (AP) (find 7-14) -RULENAME="date_r0a",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)",NORM_VALUE="group(1)" -RULENAME="date_r0b",EXTRACTION="(%reMonthNumber-%reDayNumber-%reYear2Digit)",NORM_VALUE="UNDEF-centurygroup(4)-group(2)-group(3)" +RULENAME="date_r0a",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber",NORM_VALUE="group(0)" +RULENAME="date_r0b",EXTRACTION="%reMonthNumber-%reDayNumber-%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-group(1)-group(2)" RULENAME="date_r0c",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))" RULENAME="date_r0d",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(1))-%normDay(group(2))" -RULENAME="date_r0e",EXTRACTION="%reMonthNumber-%reDayNumber( \(.*?\))",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(2))",OFFSET="group(1)-group(2)" +RULENAME="date_r0e",EXTRACTION="%reMonthNumber-%reDayNumber \(.*?\)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(2))",OFFSET="group(1)-group(2)" // date_r1 // EXAMPLE r1a_1: 1.3.99 // EXAMPLE r1b_1: 1.3.1999 -RULENAME="date_r1a",EXTRACTION="%reDayNumber[\.]%reMonthNumber[\.]%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))-%normDay(group(1))" -RULENAME="date_r1b",EXTRACTION="%reDayNumber[\.]%reMonthNumber[\.]%reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))" +RULENAME="date_r1a",EXTRACTION="%reDayNumber\.%reMonthNumber\.%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))-%normDay(group(1))" +RULENAME="date_r1b",EXTRACTION="%reDayNumber\.%reMonthNumber\.%reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))" // date_r2 (keep though it is English date format) // EXAMPLE r2a_1: Februar 25, 2009 // EXAMPLE r2a_2: Feb. 25, 2009 -RULENAME="date_r2a",EXTRACTION="(%reMonthLong|%reMonthShort) %reDayNumber[\s]?, %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(1))-%normDay(group(4))" +RULENAME="date_r2a",EXTRACTION="%(reMonthLong|reMonthShort) %reDayNumber\s?, %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))" // date_r3 // EXAMPLE r3a_1: 25. Februar 2009 // EXAMPLE r3a_2: 25 Feb 2009 // EXAMPLE r3a_3: 25 Feb. 2009 // EXAMPLE r3a_4: 25. November des Jahres 2001 -RULENAME="date_r3a",EXTRACTION="%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? %reYear4Digit",NORM_VALUE="group(6)-%normMonth(group(2))-%normDay(group(1))" +RULENAME="date_r3a",EXTRACTION="%reDayNumber\.? 
%(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))" // date_r4 // EXAMPLE r4a_1: November 19 // EXAMPLE r4b_1: 19. November // EXAMPLE r4c_1: November 15 - 18 (find November 18) // EXAMPLE r4d_1: 19. und 20. Januar (find 19. Januar) -RULENAME="date_r4a",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(4))" -RULENAME="date_r4b",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber[\.]?) (%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(5))-%normDay(group(1))" -RULENAME="date_r4c",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)(\–| - | – |-|–)(%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(9))",OFFSET="group(9)-group(9)" -RULENAME="date_r4d",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber[\.]?)%reAndOrTo(%reDayWordTh|%reDayNumberTh|%reDayNumber[.]?) (%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(10))-%normDay(group(1))",OFFSET="group(1)-group(1)" +RULENAME="date_r4a",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(2))" +RULENAME="date_r4b",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(1))" +RULENAME="date_r4c",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)\s*%reAndOrTo\s*%(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(4))",OFFSET="group(4)-group(4)" +RULENAME="date_r4d",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(4))-%normDay(group(1))",OFFSET="group(1)-group(1)" // date_r5 // EXAMPLE r5a_1: Freitag Oktober 13 // EXAMPLE r5b_1: Freitag 13. Oktober -RULENAME="date_r5a",EXTRACTION="%reWeekday[,]? (%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(5))" -RULENAME="date_r5b",EXTRACTION="%reWeekday[,]? (%reDayWordTh|%reDayNumberTh|%reDayNumber)[\.]? (%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(6))-%normDay(group(2))" +RULENAME="date_r5a",EXTRACTION="%reWeekday,? %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(3))" +RULENAME="date_r5b",EXTRACTION="%reWeekday,? %(reDayNumber|reDayNumberTh|reDayWordTh)\.? %(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))-%normDay(group(2))" // date_r6 // EXAMPLE r6a_1: 14. und 15. September 2010 (find: 14. September 2010) -RULENAME="date_r6a",EXTRACTION="(%reDayNumberTh|%reDayNumber)[\.]?%reAndOrTo(%reDayNumberTh|%reDayNumber)[\.]? (%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(11)-%normMonth(group(8))-%normDay(group(1))",OFFSET="group(1)-group(1)" +RULENAME="date_r6a",EXTRACTION="%(reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(4))-%normDay(group(1))",OFFSET="group(1)-group(1)" // date_r7 // EXAMPLE r7a_1: Friday Oktober 13 2009 -RULENAME="date_r7a",EXTRACTION="%reWeekday[,]? 
(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)[,]? %reYear4Digit",NORM_VALUE="group(9)-%normMonth(group(2))-%normDay(group(5))" +RULENAME="date_r7a",EXTRACTION="%reWeekday,? %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh),? %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(2))-%normDay(group(3))" // date_r8 // EXAMPLE 8a_1: tomorrow @@ -144,8 +144,8 @@ RULENAME="date_r9a",EXTRACTION="%reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(g // EXAMPLE r10a_1: November 2001 // EXAMPLE r10a_2: Nov. 2001 // EXAMPLE r10b_1: Mai and Juni 2011 (find Mai 2001) -RULENAME="date_r10a",EXTRACTION="(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(1))" -RULENAME="date_r10b",EXTRACTION="(%reMonthLong|%reMonthShort)%reAndOrTo(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(8)-%normMonth(group(1))",OFFSET="group(1)-group(1)" +RULENAME="date_r10a",EXTRACTION="(?:%(reMonthLong|reMonthShort)%reAndOrTo)?%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(3))",OFFSET="group(3)-group(4)" +RULENAME="date_r10b",EXTRACTION="%(reMonthLong|reMonthShort)%reAndOrTo%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(1))",OFFSET="group(1)-group(1)" // date_r11 // EXAMPLE r11a_1: November diesen Jahres @@ -159,8 +159,8 @@ RULENAME="date_r11a",EXTRACTION="%reMonthLong %reThisNextLast Jahr[es]*",NORM_VA // EXAMPLE r12b_1: Sommer 2001 // EXAMPLE r12c_1: Sommer 69 RULENAME="date_r12a",EXTRACTION="%reSeason",NORM_VALUE="UNDEF-year-%normSeason(group(1))" -RULENAME="date_r12b",EXTRACTION="%reSeason( des Jahres | )%reYear4Digit",NORM_VALUE="group(3)-%normSeason(group(1))" -RULENAME="date_r12c",EXTRACTION="%reSeason( des Jahres | )%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normSeason(group(1))" +RULENAME="date_r12b",EXTRACTION="%reSeason(?: des Jahres|) %reYear4Digit",NORM_VALUE="group(2)-%normSeason(group(1))" +RULENAME="date_r12c",EXTRACTION="%reSeason(?: des Jahres|) %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normSeason(group(1))" ////////////////////////////// // PART-OF-YEAR GRANULARITY // @@ -168,9 +168,9 @@ RULENAME="date_r12c",EXTRACTION="%reSeason( des Jahres | )%reYear2Digit",NORM_VA // date_r13 // EXAMPLE r13a_1: das erste Quartal 2001 // EXAMPLE r13a_1: das erste Quartal -RULENAME="date_r13a",EXTRACTION="([Dd]as |[Dd]er |[Dd]ie )?%rePartOfYear (des Jahres )?%reYear4Digit",NORM_VALUE="group(4)-%normPartOfYear(group(2))" -RULENAME="date_r13b",EXTRACTION="([Dd]as |[Dd]er |[Dd]ie )?%rePartOfYear",NORM_VALUE="UNDEF-year-%normPartOfYear(group(2))" -RULENAME="date_r13c",EXTRACTION="([Dd]as |[Dd]er |[Dd]ie )?%rePartOfYear des Vorjahres",NORM_VALUE="UNDEF-REF-year-MINUS-1%normPartOfYear(group(2))" +RULENAME="date_r13a",EXTRACTION="(?:[Dd]as |[Dd]er |[Dd]ie |)%rePartOfYear (des Jahres )?%reYear4Digit",NORM_VALUE="group(3)-%normPartOfYear(group(1))" +RULENAME="date_r13b",EXTRACTION="(?:[Dd]as |[Dd]er |[Dd]ie |)%rePartOfYear",NORM_VALUE="UNDEF-year-%normPartOfYear(group(1))" +RULENAME="date_r13c",EXTRACTION="(?:[Dd]as |[Dd]er |[Dd]ie |)%rePartOfYear des Vorjahres",NORM_VALUE="UNDEF-REF-year-MINUS-1%normPartOfYear(group(1))" ////////////////////// // Year Granularity // @@ -178,7 +178,7 @@ RULENAME="date_r13c",EXTRACTION="([Dd]as |[Dd]er |[Dd]ie )?%rePartOfYear des Vor // date_r14 // EXAMPLE r14a_1: 2009 // EXAMPLE r14a_2: Jahr 2009 -RULENAME="date_r14a",EXTRACTION="(Jahr[es]* )?%reYear4Digit",NORM_VALUE="group(2)" +RULENAME="date_r14a",EXTRACTION="(?:Jahr[es]* 
|)%reYear4Digit",NORM_VALUE="group(1)" //date_r15 // EXAMPLE r15a_1: 1850-58 (find: 1858) @@ -187,7 +187,7 @@ RULENAME="date_r15a",EXTRACTION="%reYear4Digit%reAndOrTo%reYear2Digit",NORM_VALU // date_r16 // EXAMPLE r16a_1: neunzehnsechsundneuzig -RULENAME="date_r16a",EXTRACTION="%reNumWordTeen( |-|)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(1))%normDurationNumber(group(3))" +RULENAME="date_r16a",EXTRACTION="%reNumWordTeen(?: |-|)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(1))%normDurationNumber(group(2))" ///////////////////////// // Century Granularity // @@ -196,8 +196,9 @@ RULENAME="date_r16a",EXTRACTION="%reNumWordTeen( |-|)%reNumWord2D",NORM_VALUE="% // EXAMPLE r17a_1: Das 20. Jahrhundert // EXAMPLE r17b_1: Im 18. und 19. Jahrhundert (find: 17. Jahrhundert) // EXAMPLE 2: the seventh century -RULENAME="date_r17a",EXTRACTION="(Das )?(%reDayNumber[\.]?|%reDayWordTh) Jahrhundert[s]?",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))" -RULENAME="date_r17b",EXTRACTION="(Das )?(%reDayNumber[\.]?|%reDayWordTh)%reAndOrTo(%reDayNumber[\.]?|%reDayWordTh) Jahrhundert[s]?",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))",OFFSET="group(2)-group(2)" +RULENAME="date_r17a",EXTRACTION="(?:Das )?%(reDayNumber|reDayNumberTh|reDayWordTh) Jahrhunderts?",NORM_VALUE="%normDay(%SUM%(%normDay(group(1)),-1))" +RULENAME="date_r17b",EXTRACTION="(?:Das )?%(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh) Jahrhunderts?",NORM_VALUE="%normDay(%SUM%(%normDay(group(1)),-1))",OFFSET="group(1)-group(1)" +RULENAME="date_r17c",EXTRACTION="%rePartWords des %(reDayNumber|reDayNumberTh|reDayWordTh) Jahrhunderts?",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))",NORM_MOD="%normPartWords(group(1))" /////////////////////////////////// // GRANULARITY INDEPENDENT RULES // @@ -207,22 +208,22 @@ RULENAME="date_r17b",EXTRACTION="(Das )?(%reDayNumber[\.]?|%reDayWordTh)%reAndOr // EXAMPLE r18b_1: Anfang 1999 // EXAMPLE r18c_1: Anfang November 1999 // EXAMPLE r18d_1: Anfang November 2000 -RULENAME="date_r18a",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(1))" -RULENAME="date_r18b",EXTRACTION="%rePartWords( |)%reYear4Digit",NORM_VALUE="group(3)",NORM_MOD="%normPartWords(group(1))" -RULENAME="date_r18c",EXTRACTION="%rePartWords( |)(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(6)-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))" -RULENAME="date_r18d",EXTRACTION="%rePartWords( |)(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))" +RULENAME="date_r18a",EXTRACTION="(?:%reApproximate |)%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(2))" +RULENAME="date_r18b",EXTRACTION="(?:%reApproximate |)%rePartWords ?%reYear4Digit",NORM_VALUE="group(3)",NORM_MOD="%normPartWords(group(2))" +RULENAME="date_r18c",EXTRACTION="(?:%reApproximate |)%rePartWords ?%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(3))",NORM_MOD="%normPartWords(group(2))" +RULENAME="date_r18d",EXTRACTION="(?:%reApproximate |)%rePartWords ?%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))",NORM_MOD="%normPartWords(group(2))" // date_r19 // EXAMPLE r19a_1: die 1920er Jahre // EXAMPLE r19b_1: die 20er Jahre // EXAMPLE r19a_1: die frühen 1920er Jahre // EXAMPLE r19b_1: die frühen 20er Jahre -RULENAME="date_r19a",EXTRACTION="([Dd]ie |[Dd]en )?%reYear4Digit(ern|er)( Jahren?|-Jahren?)?",NORM_VALUE="%SUBSTRING%(group(2),0,3)" 
-RULENAME="date_r19b",EXTRACTION="([Dd]ie |[Dd]en )?%reYear2Digit(ern|er)( Jahren?|-Jahren?)?",NORM_VALUE="UNDEF-century%SUBSTRING%(group(2),0,1)" -RULENAME="date_r19c",EXTRACTION="([Dd]ie |[Dd]en )?%rePartWords([ ]?)%reYear4Digit(ern|er)( Jahren?|-Jahren?)?",NORM_VALUE="%SUBSTRING%(group(4),0,3)",NORM_MOD="%normPartWords(group(2))" -RULENAME="date_r19d",EXTRACTION="([Dd]ie |[Dd]en )?%rePartWords([ ]?)%reYear2Digit(ern|er)( Jahren?|-Jahren?)?",NORM_VALUE="UNDEF-century%SUBSTRING%(group(4),0,1)",NORM_MOD="%normPartWords(group(2))" -RULENAME="date_r19e",EXTRACTION="([Dd]ie |[Dd]en )?%reDecadeWord( [Jj]ahren?|jahren?)?",NORM_VALUE="UNDEF-century%normDecadeWord(group(2))" -RULENAME="date_r19f",EXTRACTION="([Dd]ie |[Dd]en )?%rePartWords der %reDecadeWord( [Jj]ahren?|jahren?)?",NORM_VALUE="UNDEF-century%normDecadeWord(group(3))",NORM_MOD="%normPartWords(group(2))" +RULENAME="date_r19a",EXTRACTION="(?:[Dd]ie |[Dd]en )?%reYear4Digit(?:ern|er)(?: Jahren?|-Jahren?|)",NORM_VALUE="%SUBSTRING%(group(1),0,3)" +RULENAME="date_r19b",EXTRACTION="(?:[Dd]ie |[Dd]en )?%reYear2Digit(?:ern|er)(?: Jahren?|-Jahren?|)",NORM_VALUE="UNDEF-century%SUBSTRING%(group(1),0,1)" +RULENAME="date_r19c",EXTRACTION="(?:[Dd]ie |[Dd]en )?%rePartWords ?%reYear4Digit(?:ern|er)(?: Jahren?|-Jahren?|)",NORM_VALUE="%SUBSTRING%(group(2),0,3)",NORM_MOD="%normPartWords(group(1))" +RULENAME="date_r19d",EXTRACTION="(?:[Dd]ie |[Dd]en )?%rePartWords ?%reYear2Digit(?:ern|er)(?: Jahren?|-Jahren?|)",NORM_VALUE="UNDEF-century%SUBSTRING%(group(2),0,1)",NORM_MOD="%normPartWords(group(1))" +RULENAME="date_r19e",EXTRACTION="(?:[Dd]ie |[Dd]en )?%reDecadeWord(?: [Jj]ahren?|jahren?|)",NORM_VALUE="UNDEF-century%normDecadeWord(group(1))" +RULENAME="date_r19f",EXTRACTION="(?:[Dd]ie |[Dd]en )?%rePartWords der %reDecadeWord(?: [Jj]ahren?|-[Jj]ahren?|)",NORM_VALUE="UNDEF-century%normDecadeWord(group(2))",NORM_MOD="%normPartWords(group(1))" // date_r20 // EXAMPLE r20a_1: dieses Jahr @@ -230,8 +231,8 @@ RULENAME="date_r19f",EXTRACTION="([Dd]ie |[Dd]en )?%rePartWords der %reDecadeWor // EXAMPLE r20c_1: diesen November // EXAMPLE r20d_1: diesen Montag // EXAMPLE r20e_1: diesen Sommer -RULENAME="date_r20a",EXTRACTION="([Dd]er |[Dd]ie |[Dd]as )?%reThisNextLast %reUnit",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%normUnit(group(3))" -RULENAME="date_r20b",EXTRACTION="([Ss]elbe[nrs]?|[Gg]leiche[nrs]?) Tag",NORM_VALUE="UNDEF-REF-day-PLUS-0" +RULENAME="date_r20a",EXTRACTION="(?:[Dd]er |[Dd]ie |[Dd]as |[Dd]es |[Ii]m |[Aa]m )?%reThisNextLast %reUnit(?:e?s?)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normUnit(group(2))" +RULENAME="date_r20b",EXTRACTION="(?:[Aa]m |)(?:[Ss]elbe[nrs]?|[Gg]leiche[nrs]?) 
Tag",NORM_VALUE="UNDEF-REF-day-PLUS-0" RULENAME="date_r20c",EXTRACTION="%reThisNextLast %reMonthLong",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normMonthToEnglish(group(2))" RULENAME="date_r20d",EXTRACTION="%reThisNextLast %reWeekday",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normWeekday(group(2))" RULENAME="date_r20e",EXTRACTION="%reThisNextLast %reSeason",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normSeason(group(2))" @@ -248,19 +249,19 @@ RULENAME="date_r21d",EXTRACTION="%rePartWords %reThisNextLast %reSeason[nes]*",N // date_r22 // EXAMPLE r22a_1: letztes Wochenende -RULENAME="date_r22a",EXTRACTION="([Dd]ieses |[Ll]etztes )Wochenende",NORM_VALUE="UNDEF-last-week-WE" +RULENAME="date_r22a",EXTRACTION="(?:[Dd]ieses |[Ll]etztes )Wochenende",NORM_VALUE="UNDEF-last-week-WE" // date_r23 // EXAMPLE r23a_1: das letztjährige Quartal // EXAMPLE r23b_1: das Quartal -RULENAME="date_r23a",EXTRACTION="([Dd]er|[Dd]ie|[Dd]as) (letztjährige) Quartals?",NORM_VALUE="UNDEF-REF-quarter-MINUS-4" -RULENAME="date_r23b",EXTRACTION="([Dd]er|[Dd]ie|[Dd]as) Quartals?",NORM_VALUE="UNDEF-REF-quarter-PLUS-0" +RULENAME="date_r23a",EXTRACTION="(?:[Dd]er|[Dd]ie|[Dd]as) letztjährige Quartals?",NORM_VALUE="UNDEF-REF-quarter-MINUS-4" +RULENAME="date_r23b",EXTRACTION="(?:[Dd]er|[Dd]ie|[Dd]as) Quartals?",NORM_VALUE="UNDEF-REF-quarter-PLUS-0" // date_r24 // EXAMPLE r24a_1: ein Jahr früher // EXAMPLE r24b_2: ein Jahr später -RULENAME="date_r24a",EXTRACTION="[Ee]in Jahr (früher|vorher|davor)",NORM_VALUE="UNDEF-REF-year-MINUS-1" -RULENAME="date_r24b",EXTRACTION="[Ee]in Jahr (später|danach)",NORM_VALUE="UNDEF-REF-year-PLUS-1" +RULENAME="date_r24a",EXTRACTION="[Ee]in Jahr (?:früher|vorher|davor|vor)",NORM_VALUE="UNDEF-REF-year-MINUS-1" +RULENAME="date_r24b",EXTRACTION="[Ee]in Jahr (?:später|danach|nach)",NORM_VALUE="UNDEF-REF-year-PLUS-1" // date_r25 // EXAMPLE r25a_1: etwa zehn Tage später @@ -269,12 +270,12 @@ RULENAME="date_r24b",EXTRACTION="[Ee]in Jahr (später|danach)",NORM_VALUE="UNDEF // EXAMPLE r25d_1: etwa zehn Tage früher // EXAMPLE r25e_1: etwa 20 Tage früher // EXAMPLE r25f_1: etwa ein Tag früher -RULENAME="date_r26a",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|Minuten|Stunden) (später|danach)",NORM_VALUE="UNDEF-REF-%normUnit(group(6))-PLUS-%normDurationNumber(group(3))" -RULENAME="date_r26b",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|Minuten|Stunden) (später|danach)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-group(3)" -RULENAME="date_r26c",EXTRACTION="(%reApproximate )?([Ee]inen|[Ee]ine|[Ee]in) (%reUnit) (später|danach)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-1" -RULENAME="date_r26d",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|Minuten|Stunden) (früher|vorher|davor)",NORM_VALUE="UNDEF-REF-%normUnit(group(6))-MINUS-%normDurationNumber(group(3))" -RULENAME="date_r26e",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|Minuten|Stunden) (früher|vorher|davor)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-MINUS-group(3)" -RULENAME="date_r26f",EXTRACTION="(%reApproximate )?([Ee]inen|[Ee]ine|[Ee]in) (%reUnit) (früher|vorher|davor)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-MINUS-1" +RULENAME="date_r26a",EXTRACTION="(?:%reApproximate |)%(reNumWord1D|reNumWord2D) %reUnitFine (?:später|danach|nach)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-PLUS-%normDurationNumber(group(2))" +RULENAME="date_r26b",EXTRACTION="(?:%reApproximate |)(\d+) %reUnitFine (?:später|danach|nach)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-PLUS-group(2)" 
+RULENAME="date_r26c",EXTRACTION="(?:%reApproximate |)(?:[Ee]inen|[Ee]ine|[Ee]in) %reUnitFine (?:später|danach|nach)",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-PLUS-1" +RULENAME="date_r26d",EXTRACTION="(?:%reApproximate |)%(reNumWord1D|reNumWord2D) %reUnitFine (?:früher|vorher|davor|vor)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-MINUS-%normDurationNumber(group(2))" +RULENAME="date_r26e",EXTRACTION="(?:%reApproximate |)(\d+) %reUnitFine (?:früher|vorher|davor|vor)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-MINUS-group(2)" +RULENAME="date_r26f",EXTRACTION="(?:%reApproximate |)(?:[Ee]inen|[Ee]ine|[Ee]in) %reUnitFine (?:früher|vorher|davor|vor)",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-MINUS-1" /////////////////// // HOLIDAY RULES // @@ -298,27 +299,30 @@ RULENAME="date_r28c",EXTRACTION="%reHolidayVar %reYear2Digit",NORM_VALUE="UNDEF- // PAST, PRESENT, FUTURE expressions -RULENAME="date_r29a",EXTRACTION="([Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnit (später)",NORM_VALUE="FUTURE_REF" -RULENAME="date_r29b",EXTRACTION="([Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnit (früher)",NORM_VALUE="PAST_REF" +RULENAME="date_r29a",EXTRACTION="(?:[Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnit später",NORM_VALUE="FUTURE_REF" +RULENAME="date_r29b",EXTRACTION="(?:[Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnit früher",NORM_VALUE="PAST_REF" //////////////////// // NEGATIVE RULES // //////////////////// // CHECK THESE RULES (the German tag set is not equal to the English tag set) -RULENAME="date_r1a_negative",EXTRACTION="(2[3456789]\d\d)",NORM_VALUE="REMOVE" -RULENAME="date_r1b1_negative",EXTRACTION="%reYear4Digit ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NN:" -RULENAME="date_r1b2_negative",EXTRACTION="%reYear4Digit (respektive|von|oder) (%reYear4Digit|[\d]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(5):NN:" -RULENAME="date_r1c1_negative",EXTRACTION="%reYear4Digit ([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):ADJA:group(3):NN:" -RULENAME="date_r1c2_negative",EXTRACTION="%reYear4Digit ([\S]+) ([\S]+) ([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):ADJA:group(3):KON:group(4):ADJA:group(5):NN:" -RULENAME="date_r1d_negative",EXTRACTION="%reYear4Digit (m\b|km\b|ft\b|yr\b|dg\b|cm\b|ha\b|sq\b|PS\b)",NORM_VALUE="REMOVE" +RULENAME="date_r1a_negative",EXTRACTION="2[3456789]\d\d",NORM_VALUE="REMOVE" +// March 2017 reduced range: many missing matches e.g. in Heidelberg University Wiki article. 
+RULENAME="date_r1b1_negative",EXTRACTION="(?:1[0-7]\d\d|0\d\d\d|2[1-9]\d\d) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):NN:" +RULENAME="date_r1c1_negative",EXTRACTION="(?:1[0-7]\d\d|0\d\d\d|2[1-9]\d\d) (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):ADJA:group(2):NN:" +RULENAME="date_r1c2_negative",EXTRACTION="(?:1[0-7]\d\d|0\d\d\d|2[1-9]\d\d) (\S+) (\S+) (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):ADJA:group(2):KON:group(3):ADJA:group(4):NN:" +//RULENAME="date_r1b1_negative",EXTRACTION="%reYear4Digit (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NN:" +RULENAME="date_r1b2_negative",EXTRACTION="%reYear4Digit (?:respektive|von|oder) \d+ (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NN:" +//RULENAME="date_r1c1_negative",EXTRACTION="%reYear4Digit (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):ADJA:group(3):NN:" +//RULENAME="date_r1c2_negative",EXTRACTION="%reYear4Digit (\S+) (\S+) (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):ADJA:group(3):KON:group(4):ADJA:group(5):NN:" +RULENAME="date_r1d_negative",EXTRACTION="%reYear4Digit (?:m\b|km\b|ft\b|yr\b|dg\b|cm\b|ha\b|sq\b|PS\b|Euro)",NORM_VALUE="REMOVE" // Further negative rules (March 2015, Jannik Strötgen) // typical fairy tail expression // EXAMPLE r1e1_negative: Märchen aus 1001 Nacht // EXAMPLE r1e2_negative: 1001-Nacht -RULENAME="date_r1e1_negative",EXTRACTION="Märchen aus 1001 Nacht",NORM_VALUE="REMOVE" -RULENAME="date_r1e2_negative",EXTRACTION="1001(-| )Nacht",NORM_VALUE="REMOVE" +RULENAME="date_r1e1_negative",EXTRACTION="(?:Märchen aus)? 1001[ -]Nacht",NORM_VALUE="REMOVE" // Further negative rules (March 2015, Jannik Strötgen) // Addresses and room numbers @@ -326,20 +330,20 @@ RULENAME="date_r1e2_negative",EXTRACTION="1001(-| )Nacht",NORM_VALUE="REMOVE" // EXAMPLE r2b_negative: 1010 Wien // EXAMPLE r2c_negative: 1600 Pennsylvania Avenue // EXAMPLE r2d_negative: Sitzungssaal 1901 -RULENAME="date_r2a_negative",EXTRACTION="([A-Z][\S]+)?([Ss]traße|[Ss]trasse|[Gg]asse|[Ww]eg) (\d)+, %reYear4Digit ([A-Z][\S]+\b)",NORM_VALUE="REMOVE" -RULENAME="date_r2b_negative",EXTRACTION="%reYear4Digit (Wien|Graz|Linz|Salzburg|Innsbruck)",NORM_VALUE="REMOVE" -RULENAME="date_r2c_negative",EXTRACTION="%reYear4Digit [A-Z]([\S]+) (Avenue|Street)",NORM_VALUE="REMOVE" -RULENAME="date_r2d_negative",EXTRACTION="([A-Z][\S]+)?([Ss]aal|[Rr]aum|[Zz]immer) %reYear4Digit",NORM_VALUE="REMOVE" +RULENAME="date_r2a_negative",EXTRACTION="(?:[A-Z]\S+)?(?:[Ss]traße|[Ss]trasse|[Gg]asse|[Ww]eg) \d+, %reYear4Digit [A-Z]\S+\b",NORM_VALUE="REMOVE" +RULENAME="date_r2b_negative",EXTRACTION="%reYear4Digit (?:Wien|Graz|Linz|Salzburg|Innsbruck)",NORM_VALUE="REMOVE" +RULENAME="date_r2c_negative",EXTRACTION="%reYear4Digit [A-Z]\S+ (?:Avenue|Street|Ave)",NORM_VALUE="REMOVE" +RULENAME="date_r2d_negative",EXTRACTION="(?:[A-Z]\S+)?(?:[Ss]aal|[Rr]aum|[Zz]immer) %reYear4Digit",NORM_VALUE="REMOVE" // Further negative rules (March 2015, Jannik Strötgen) // EXAMPLE r3a_negative: 1200 davon [sind tot] // EXAMPLE r3b_negative: mindestens 2000 [sind tot] // EXAMPLE r3c_negative: von 2000 auf 1800 [reduziert] -RULENAME="date_r3a_negative",EXTRACTION="%reYear4Digit (davon)",NORM_VALUE="REMOVE" -RULENAME="date_r3b_negative",EXTRACTION="([Üü]ber|[Dd]arunter|[Dd]avon|[Kk]napp|[Ww]eniger als|[Mm]ehr als|[Rr]und|[Mm]indestens|[Hh]öchstens|[Mm]aximal|[Ww]eitere) %reYear4Digit",NORM_VALUE="REMOVE" -RULENAME="date_r3c_negative",EXTRACTION="(von|um) (%reYear4Digit|[\d]+) (auf) (%reYear4Digit|[\d]+)",NORM_VALUE="REMOVE" 
+RULENAME="date_r3a_negative",EXTRACTION="%reYear4Digit davon",NORM_VALUE="REMOVE" +RULENAME="date_r3b_negative",EXTRACTION="(?:[Üü]ber|[Dd]arunter|[Dd]avon|[Kk]napp|[Ww]eniger als|[Mm]ehr als|[Rr]und|[Mm]indestens|[Hh]öchstens|[Mm]aximal|[Ww]eitere) %reYear4Digit",NORM_VALUE="REMOVE" +RULENAME="date_r3c_negative",EXTRACTION="(?:von|um) \d+ auf \d+",NORM_VALUE="REMOVE" // Further negative rules (March 2015, Jannik Strötgen) // EXAMPLE r4a_negative: UN Resolution 1441 -RULENAME="date_r4a_negative",EXTRACTION="([Rr]esolution|[Ee]ntschließung) %reYear4Digit",NORM_VALUE="REMOVE" +RULENAME="date_r4a_negative",EXTRACTION="(?:[Rr]esolution|[Ee]ntschließung) %reYear4Digit",NORM_VALUE="REMOVE" diff --git a/resources/german/rules/resources_rules_durationrules.txt b/resources/german/rules/resources_rules_durationrules.txt index 9e334e63..8513a366 100644 --- a/resources/german/rules/resources_rules_durationrules.txt +++ b/resources/german/rules/resources_rules_durationrules.txt @@ -10,16 +10,16 @@ // duration_r1 // EXAMPLE r1a_1: etwa fünf Tage // EXAMPLE r1b_1: etwa 20 Tage -// EXAMPLE r1c_1: etwa fünf Stunden -// EXAMPLE r1d_1: etwa 20 Stunden -RULENAME="duration_r1a1",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) %reUnit",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(6))" -RULENAME="duration_r1b1",EXTRACTION="(%reApproximate )?([\d]+) %reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(4))" -RULENAME="duration_r1c1",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (Minuten?|Stunden?)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(6))" -RULENAME="duration_r1d1",EXTRACTION="(%reApproximate )?([\d]+) (Minuten?|Stunden?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(4))" -RULENAME="duration_r1a2",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D)%reAndOrTo(%reNumWord2D|%reNumWord1D) %reUnit",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(10))",OFFSET="group(0)-group(3)" -RULENAME="duration_r1b2",EXTRACTION="(%reApproximate )?([\d]+)%reAndOrTo([\d]+) %reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(6))",OFFSET="group(0)-group(3)" -RULENAME="duration_r1c2",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D)%reAndOrTo(%reNumWord2D|%reNumWord1D) (Minuten?|Stunden?)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(10))" -RULENAME="duration_r1d2",EXTRACTION="(%reApproximate )?([\d]+)%reAndOrTo([\d]+) (Minuten?|Stunden?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(6))" +// EXAMPLE r1a_1: etwa fünf Stunden +// EXAMPLE r1b_1: etwa 20 Stunden +RULENAME="duration_r1a1",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)%(reNumWord1D|reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(2))%normUnit4Duration(group(3))" +RULENAME="duration_r1b1",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)(\d+) %reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(3))" +RULENAME="duration_r1a2",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)%(reNumWord1D|reNumWord2D) ([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)",NORM_VALUE="PT%normDurationNumber(group(2))%normUnit4Duration(group(3))" +RULENAME="duration_r1b2",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)(\d+) ([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)",NORM_VALUE="PTgroup(2)%normUnit4Duration(group(3))" +RULENAME="duration_r1c1",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)%(reNumWord1D|reNumWord2D)%reAndOrTo%(reNumWord1D|reNumWord2D) 
%reUnit",NORM_VALUE="P%normDurationNumber(group(2))%normUnit4Duration(group(5))",OFFSET="group(0)-group(2)" +RULENAME="duration_r1d1",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)(\d+)%reAndOrTo(\d+) %reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(5))",OFFSET="group(0)-group(2)" +RULENAME="duration_r1c2",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)%(reNumWord1D|reNumWord2D)%reAndOrTo%(reNumWord1D|reNumWord2D) ([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)",NORM_VALUE="PT%normDurationNumber(group(2))%normUnit4Duration(group(5))",OFFSET="group(0)-group(2)" +RULENAME="duration_r1d2",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)(\d+)%reAndOrTo(\d+) ([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)",NORM_VALUE="TPgroup(2)%normUnit4Duration(group(5))",OFFSET="group(0)-group(2)" // duration_r2 // EXAMPLE r2a_1: die nächsten zwanzig Tage @@ -28,28 +28,30 @@ RULENAME="duration_r1d2",EXTRACTION="(%reApproximate )?([\d]+)%reAndOrTo([\d]+) // EXAMPLE r2d_1: die nächsten zwanzig Minuten // EXAMPLE r2e_1: die nächsten 20 Minuten // EXAMPLE r2f_1: die nächsten paar Minuten -RULENAME="duration_r2a",EXTRACTION="(%reApproximate )?die %reThisNextLast (%reNumWord2D|%reNumWord1D)( |-)%reUnit( oder so)?",NORM_VALUE="P%normDurationNumber(group(4))%normUnit4Duration(group(8))" -RULENAME="duration_r2b",EXTRACTION="(%reApproximate )?die %reThisNextLast ([\d]+)( |-)%reUnit( oder so)?",NORM_VALUE="Pgroup(4)%normUnit4Duration(group(6))" -RULENAME="duration_r2c",EXTRACTION="(%reApproximate )?die %reThisNextLast (paar|wenigen?|einigen?) %reUnit( oder so)?",NORM_VALUE="PX%normUnit4Duration(group(5))" -RULENAME="duration_r2d",EXTRACTION="(%reApproximate )?die %reThisNextLast (%reNumWord2D|%reNumWord1D)( |-)(Minuten?|Stunden?)( oder so)?",NORM_VALUE="PT%normDurationNumber(group(4))%normUnit4Duration(group(8))" -RULENAME="duration_r2e",EXTRACTION="(%reApproximate )?die %reThisNextLast ([\d]+)( |-)(Minuten?|Stunden?)( oder so)?",NORM_VALUE="PTgroup(4)%normUnit4Duration(group(6))" -RULENAME="duration_r2f",EXTRACTION="(%reApproximate )?die %reThisNextLast (paar|wenigen?|einigen?) (Minuten?|Stunden?)( oder so)?",NORM_VALUE="PTX%normUnit4Duration(group(5))" +RULENAME="duration_r2a",EXTRACTION="(?:%reApproximate |)die %reThisNextLast %(reNumWord1D|reNumWord2D)[ -]%reUnit(?: oder so)?",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(4))" +RULENAME="duration_r2b",EXTRACTION="(?:%reApproximate |)die %reThisNextLast (\d+)[ -]%reUnit(?: oder so)?",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(4))" +RULENAME="duration_r2c",EXTRACTION="(?:%reApproximate |)die %reThisNextLast (?:paar|wenigen?|einigen?) %reUnit(?: oder so)?",NORM_VALUE="PX%normUnit4Duration(group(3))" +// Note: PTX, not PT; subtle difference +RULENAME="duration_r2d",EXTRACTION="(?:%reApproximate |)die %reThisNextLast %(reNumWord1D|reNumWord2D)[ -]([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)(?: oder so)?",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(4))" +RULENAME="duration_r2e",EXTRACTION="(?:%reApproximate |)die %reThisNextLast (\d+)[ -]([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)(?: oder so)?",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(4))" +RULENAME="duration_r2f",EXTRACTION="(?:%reApproximate |)die %reThisNextLast (?:paar|wenigen?|einigen?) 
([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)(?: oder so)?",NORM_VALUE="PTX%normUnit4Duration(group(3))" // duration_r3 // EXAMPLE r3a_1: ein Jahr -// EXAMPLE r3b_1: eine Stunde +// EXAMPLE r3a_1: eine Stunde // EXAMPLE r3c_1: 20-tägig // EXAMPLE r3d_1: 20-stündig -RULENAME="duration_r3a",EXTRACTION="ein(|e|es|em|er|en)( |-)%reUnit",NORM_VALUE="P1%normUnit4Duration(group(3))" -RULENAME="duration_r3b",EXTRACTION="einer?( |-)(Minuten?|Stunden?)",NORM_VALUE="PT1%normUnit4Duration(group(2))" -RULENAME="duration_r3c",EXTRACTION="([\d]+)( |-)tägige?[ns]?",NORM_VALUE="PTgroup(1)D" -RULENAME="duration_r3d",EXTRACTION="([\d]+)( |-)stündige?[ns]?",NORM_VALUE="PTgroup(1)D" +RULENAME="duration_r3a",EXTRACTION="ein(?:e|es|em|er|en|)[ -]%reUnit",NORM_VALUE="P1%normUnit4Duration(group(1))" +RULENAME="duration_r3c",EXTRACTION="(\d+)[ -]tägige?[ns]?",NORM_VALUE="Pgroup(1)D" +RULENAME="duration_r3d",EXTRACTION="(\d+)[ -]stündige?[ns]?",NORM_VALUE="PTgroup(1)H" +RULENAME="duration_r3e",EXTRACTION="(\d+)[ -]minütige?[ns]?",NORM_VALUE="PTgroup(1)M" +RULENAME="duration_r3f",EXTRACTION="(\d+)[ -]sekündige?[ns]?",NORM_VALUE="PTgroup(1)S" // reUnitPlural -RULENAME="duration_r4a",EXTRACTION="[Dd](ie|en) ([\S]+) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(3))","POS_CONSTRAINT="group(2):ADJA:" -RULENAME="duration_r4b",EXTRACTION="[Dd](ie|en) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(2))" -RULENAME="duration_r4c",EXTRACTION="([Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(2))" -RULENAME="duration_r4d",EXTRACTION="([Ss]eit) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(2))",OFFSET="group(2)-group(2)" +RULENAME="duration_r4a",EXTRACTION="[Dd](?:ie|en) (\S+) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(2))",POS_CONSTRAINT="group(2):ADJA:" +RULENAME="duration_r4b",EXTRACTION="[Dd](?:ie|en) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(1))" +RULENAME="duration_r4c",EXTRACTION="(?:[Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(1))" +RULENAME="duration_r4d",EXTRACTION="(?:[Ss]eit) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(1))",OFFSET="group(1)-group(1)" //////////////////// // NEGATIVE RULES // @@ -58,7 +60,7 @@ RULENAME="duration_r4d",EXTRACTION="([Ss]eit) %reUnitPlural",NORM_VALUE="PX%norm // EXAMPLE r1a_negation_1: zwanzig Jahre alt // EXAMPLE r1b_negation_1: 20 Jahre alt // EXAMPLE r1c_negation_1: einige Jahre alt -RULENAME="duration_r1a_negation",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|Minuten?|Stunden?) (älter|jünger|alt|jung)",NORM_VALUE="REMOVE" -RULENAME="duration_r1b_negation",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|Minuten?|Stunden?) 
(älter|jünger|alt|jung)",NORM_VALUE="REMOVE" -RULENAME="duration_r1c_negation",EXTRACTION="(%reApproximate )?(einige) %reUnit (älter|jünger|alt|jung)",NORM_VALUE="REMOVE" +RULENAME="duration_r1a_negation",EXTRACTION="(?:%reApproximate |)%(reNumWord1D|reNumWord2D) %reUnitFine (?:älter|jünger|alt|jung)",NORM_VALUE="REMOVE" +RULENAME="duration_r1b_negation",EXTRACTION="(?:%reApproximate |)\d+ %reUnitFine (?:älter|jünger|alt|jung)",NORM_VALUE="REMOVE" +RULENAME="duration_r1c_negation",EXTRACTION="(?:%reApproximate |)einige %reUnit (?:älter|jünger|alt|jung)",NORM_VALUE="REMOVE" diff --git a/resources/german/rules/resources_rules_intervalrules.txt b/resources/german/rules/resources_rules_intervalrules.txt index 59e67a15..d0fb6a8c 100644 --- a/resources/german/rules/resources_rules_intervalrules.txt +++ b/resources/german/rules/resources_rules_intervalrules.txt @@ -16,4 +16,4 @@ RULENAME="interval_02",EXTRACTION="(?:[zZ]wischen|[wW]ährend) un RULENAME="interval_03",EXTRACTION="(?:[vV]on)?(?: )?-(?: )?",NORM_VALUE="group(1)-group(2)" RULENAME="interval_04",EXTRACTION=" bis ",NORM_VALUE="group(1)-group(2)" RULENAME="interval_05",EXTRACTION="begann (?:in|im|am) (?:,)? und endete (?:in|im|am) ",NORM_VALUE="group(1)-group(2)" -RULENAME="interval_06",EXTRACTION="/,",NORM_VALUE="group(1)-group(2)" \ No newline at end of file +RULENAME="interval_06",EXTRACTION="/,",NORM_VALUE="group(1)-group(2)" diff --git a/resources/german/rules/resources_rules_setrules.txt b/resources/german/rules/resources_rules_setrules.txt index 43b09df6..72e4a9dd 100644 --- a/resources/german/rules/resources_rules_setrules.txt +++ b/resources/german/rules/resources_rules_setrules.txt @@ -12,10 +12,10 @@ // EXAMPLE 2: every Monday // EXAMPLE 3: each September // EXAMPLE 4: every summer -RULENAME="set_r1a",EXTRACTION="(jede[nrs]?) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_QUANT="EVERY" -RULENAME="set_r1b",EXTRACTION="(jede[nrs]?) %reWeekday",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(2))",NORM_QUANT="EVERY",NORM_FREQ="1W" -RULENAME="set_r1c",EXTRACTION="(jede[nrs]?) (%reMonthLong|%reMonthShort)",NORM_VALUE="XXXX-%normMonth(group(2))",NORM_QUANT="EVERY",NORM_FREQ="1M" -RULENAME="set_r1d",EXTRACTION="(jede[nrs]?) %reSeason",NORM_VALUE="XXXX-%normSeason(group(2))",NORM_QUANT="EVERY",NORM_FREQ="1S" +RULENAME="set_r1a",EXTRACTION="(?:jede[nrs]?) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(1)),0,1))",NORM_QUANT="EVERY" +RULENAME="set_r1b",EXTRACTION="(?:jede[nrs]?) %reWeekday",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))",NORM_QUANT="EVERY",NORM_FREQ="1W" +RULENAME="set_r1c",EXTRACTION="(?:jede[nrs]?) %(reMonthLong|reMonthShort)",NORM_VALUE="XXXX-%normMonth(group(1))",NORM_QUANT="EVERY",NORM_FREQ="1M" +RULENAME="set_r1d",EXTRACTION="(?:jede[nrs]?) 
%reSeason",NORM_VALUE="XXXX-%normSeason(group(1))",NORM_QUANT="EVERY",NORM_FREQ="1S" // set_r2 // EXAMPLE r2a-1: jährlich @@ -27,6 +27,6 @@ RULENAME="set_r2a",EXTRACTION="%reSetWords",NORM_VALUE="%normSetWords(group(1))" // set_r3 // EXAMPLE r3a_1: Montag vormittags // EXAMPLE r3a_1: Montag und Samstag nachts (find: Montag nachts) -RULENAME="set_r3a",EXTRACTION="%reWeekday[ ]?%rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(2))",NORM_FREQ="1W" -RULENAME="set_r3b",EXTRACTION="%reWeekday (und|oder) %reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(4))",NORM_FREQ="1W",OFFSET="group(1)-group(1)" +RULENAME="set_r3a",EXTRACTION="%reWeekday ?%rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(2))",NORM_FREQ="1W" +RULENAME="set_r3b",EXTRACTION="%reWeekday (?:und|oder) %reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(3))",NORM_FREQ="1W",OFFSET="group(1)-group(1)" diff --git a/resources/german/rules/resources_rules_timerules.txt b/resources/german/rules/resources_rules_timerules.txt index 9ab47740..a5253096 100644 --- a/resources/german/rules/resources_rules_timerules.txt +++ b/resources/german/rules/resources_rules_timerules.txt @@ -15,11 +15,11 @@ // EXAMPLE r1c-1: 12/29/2000 20:29 // EXAMPLE r1d-1: 12/29/2000 20:29:29 // EXAMPLE r1e-1: 12/29/2000 20:29:29.79 -RULENAME="time_r1a",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7):group(8)" -RULENAME="time_r1b",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7)" -RULENAME="time_r1c",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6)" -RULENAME="time_r1d",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6):group(7)" -RULENAME="time_r1e",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute:%reTimeMinute\.%reYear2Digit",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6):group(7).group(8)" +RULENAME="time_r1a",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber[T ]%reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5):group(6)" +RULENAME="time_r1b",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber[T ]%reTimeHour:%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5)" +RULENAME="time_r1c",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5)" +RULENAME="time_r1d",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5):group(6)" +RULENAME="time_r1e",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute\.(\d\d)",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5):group(6).group(7)" ///////////////////////////// // PART-OF-DAY GRANULARITY // @@ -29,11 +29,10 @@ RULENAME="time_r1e",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reT // EXAMPLE r2b_1: Monday night // EXAMPLE r2c_1: midnight today // EXAMPLE r2d_1: yesterday morning -RULENAME="time_r2a",EXTRACTION="%rePartOfDay 
%reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(2))T%normPartOfDay(group(1))" -RULENAME="time_r2b1",EXTRACTION="%reWeekday %rePartOfDay",NORM_VALUE="UNDEF-day-%normWeekday(group(1))T%normPartOfDay(group(2))" -RULENAME="time_r2b2",EXTRACTION="%reWeekday%rePartOfDay",NORM_VALUE="UNDEF-day-%normWeekday(group(1))T%normPartOfDay(group(2))" -RULENAME="time_r2c",EXTRACTION="%rePartOfDay %reDateWord",NORM_VALUE="%normDateWord(group(2))T%normPartOfDay(group(1))" -RULENAME="time_r2d",EXTRACTION="%reDateWord %rePartOfDay",NORM_VALUE="%normDateWord(group(1))T%normPartOfDay(group(2))" +RULENAME="time_r2a",EXTRACTION="%rePartOfDay (?:des )?%reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(2))T%normPartOfDay(group(1))" +RULENAME="time_r2b",EXTRACTION="%reWeekday(?:des | |)%rePartOfDay",NORM_VALUE="UNDEF-day-%normWeekday(group(1))T%normPartOfDay(group(2))" +RULENAME="time_r2c",EXTRACTION="%rePartOfDay (?:des )?%reDateWord",NORM_VALUE="%normDateWord(group(2))T%normPartOfDay(group(1))" +RULENAME="time_r2d",EXTRACTION="%reDateWord (?:des )?%rePartOfDay",NORM_VALUE="%normDateWord(group(1))T%normPartOfDay(group(2))" /////////////////////////// // TIMEPOINT GRANULARITY // @@ -42,28 +41,28 @@ RULENAME="time_r2d",EXTRACTION="%reDateWord %rePartOfDay",NORM_VALUE="%normDateW // EXAMPLE r3a_1: 14:30 Uhr // EXAMPLE r3b_1: 14 Uhr 30 // EXAMPLE r3c_1: 15 Uhr -RULENAME="time_r3a",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute( Uhr)?",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):group(4)" -RULENAME="time_r3b",EXTRACTION="(%reApproximate )?%reTimeHour (Uhr) %reTimeMinute",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):group(5)" -RULENAME="time_r3c",EXTRACTION="(%reApproximate )?%reTimeHour Uhr",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):00" +RULENAME="time_r3a",EXTRACTION="(?:%reApproximate |)%reTimeHour:%reTimeMinute(?: Uhr)?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3)" +RULENAME="time_r3b",EXTRACTION="(?:%reApproximate |)%reTimeHour (?:Uhr) %reTimeMinute",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3)" +RULENAME="time_r3c",EXTRACTION="(?:%reApproximate |)%reTimeHour Uhr",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):00" -RULENAME="time_r3d1",EXTRACTION="(%reApproximate )?%reTimeHourWord Uhr( morgens)?",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):00" -RULENAME="time_r3d2",EXTRACTION="(%reApproximate )?%reTimeHourWord Uhr (abends|nachmittags)",NORM_VALUE="UNDEF-this-dayT%SUM%(%normDay(group(3)),12):00" -RULENAME="time_r3d3",EXTRACTION="(%reApproximate )?%reTimeHourWordAll Uhr",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):00" +RULENAME="time_r3d1",EXTRACTION="(?:%reApproximate |)%reTimeHourWord Uhr(?: morgens)?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):00" +RULENAME="time_r3d2",EXTRACTION="(?:%reApproximate |)%reTimeHourWord Uhr (?:abends|nachmittags)",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):00" +RULENAME="time_r3d3",EXTRACTION="(?:%reApproximate |)%reTimeHourWordAll Uhr",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):00" // time_r4 // EXAMPLE r4a_1: Morgen des 1. August 2000 // EXAMPLE r4b_1: Morgen des 1. August -RULENAME="time_r4a",EXTRACTION="(%reApproximate )?%rePartOfDay (des) (%reDayNumber)([\.]? |[\.])(%reMonthLong|%reMonthShort|%reMonthNumber[\.]?),? %reYear4Digit",NORM_VALUE="group(12)-%normMonth(group(8))-%normDay(group(5))T%normPartOfDay(group(3))" -RULENAME="time_r4b",EXTRACTION="(%reApproximate )?%rePartOfDay (des) (%reDayNumber)([\.]? 
|[\.])(%reMonthLong|%reMonthShort|%reMonthNumber[\.]?)",NORM_VALUE="UNDEF-year-%normMonth(group(8))-%normDay(group(5))T%normPartOfDay(group(3))" +RULENAME="time_r4a",EXTRACTION="(?:%reApproximate |)%rePartOfDay des %(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort|reMonthNumber)\.?,? %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(4))-%normDay(group(3))T%normPartOfDay(group(2))" +RULENAME="time_r4b",EXTRACTION="(?:%reApproximate |)%rePartOfDay des %(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort|reMonthNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(4))-%normDay(group(3))T%normPartOfDay(group(2))" // time_r5 // EXAMPLE r5a-1: (am) Morgen // EXAMPLE r5b-1: nächsten Morgen // EXAMPLE r5c-1: (am) Morgen desselben Tages -RULENAME="time_r5a",EXTRACTION="(\b[Aa]m) %rePartOfDay",NORM_VALUE="UNDEF-this-dayT%normPartOfDay(group(2))",OFFSET="group(2)-group(2)" -RULENAME="time_r5b",EXTRACTION="%reThisNextLast %rePartOfDay",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-dayT%normPartOfDay(group(2))" -RULENAME="time_r5c",EXTRACTION="(de[rm]selben?) %rePartOfDay",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normPartOfDay(group(2))" -RULENAME="time_r5d",EXTRACTION="%rePartOfDay (desselben|dieses) (Tages)",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normPartOfDay(group(1))" +RULENAME="time_r5a",EXTRACTION="(?:[Aa]m) %rePartOfDay",NORM_VALUE="UNDEF-this-dayT%normPartOfDay(group(1))",OFFSET="group(1)-group(1)" +RULENAME="time_r5b",EXTRACTION="(?:[Aa]m) %reThisNextLast %rePartOfDay",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-dayT%normPartOfDay(group(2))" +RULENAME="time_r5c",EXTRACTION="(?:de[rm]selben?) %rePartOfDay",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normPartOfDay(group(1))" +RULENAME="time_r5d",EXTRACTION="(?:[Aa]m) %rePartOfDay (?:desselben|dieses) Tages",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normPartOfDay(group(1))" diff --git a/src/de/unihd/dbs/heideltime/standalone/CLISwitch.java b/src/de/unihd/dbs/heideltime/standalone/CLISwitch.java index 145482be..98baa1de 100644 --- a/src/de/unihd/dbs/heideltime/standalone/CLISwitch.java +++ b/src/de/unihd/dbs/heideltime/standalone/CLISwitch.java @@ -6,6 +6,7 @@ import java.util.Date; +import de.unihd.dbs.uima.annotator.heideltime.DocumentType; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; /** @@ -18,7 +19,7 @@ public enum CLISwitch { ENCODING ("Encoding to use", "-e", "UTF-8"), OUTPUTTYPE ("Output Type output type to use", "-o", OutputType.TIMEML), LANGUAGE ("Language to use", "-l", Language.ENGLISH.toString()), - DOCTYPE ("Document Type/Domain to use", "-t", DocumentType.NARRATIVES), + DOCTYPE ("Document Type/Domain to use", "-t", DocumentType.NARRATIVE), DCT ("Document Creation Time. Format: YYYY-mm-dd.", "-dct", new Date()), CONFIGFILE ("Configuration file path", "-c", "config.props"), LOCALE ("Locale", "-locale", null), diff --git a/src/de/unihd/dbs/heideltime/standalone/DocumentType.java b/src/de/unihd/dbs/heideltime/standalone/DocumentType.java index 4ca4baad..91ad03e5 100644 --- a/src/de/unihd/dbs/heideltime/standalone/DocumentType.java +++ b/src/de/unihd/dbs/heideltime/standalone/DocumentType.java @@ -1,44 +1,28 @@ -/* - * DocumentType.java - * - * Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg. - * All rights reserved. This program and the accompanying materials - * are made available under the terms of the GNU General Public License. 
 * * authors: Andreas Fay, Jannik Strötgen * email: fay@stud.uni-heidelberg.de, stroetgen@uni-hd.de * * HeidelTime is a multilingual, cross-domain temporal tagger. * For details, see http://dbs.ifi.uni-heidelberg.de/heideltime */ -package de.unihd.dbs.heideltime.standalone; - -/** - * Type of document to be processed by HeidelTime - * - * @author Andreas Fay, University of Heidelberg - * @version 1.0 - */ -public enum DocumentType { - NARRATIVES { - public String toString() { - return "narratives"; - } - }, - NEWS { - public String toString() { - return "news"; - } - }, - COLLOQUIAL { - public String toString() { - return "colloquial"; - } - }, - SCIENTIFIC { - public String toString() { - return "scientific"; - } - } -} +package de.unihd.dbs.heideltime.standalone; + +/** + * Legacy constants to transition to the enum at {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType}. + * + * Because we cannot subclass enums, this will not be binary compatible, + * but at least we get compile-time compatibility. + * + * @author Erich Schubert + */ +@Deprecated +public final class DocumentType { + /** Use {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType#NARRATIVE} instead. */ + @Deprecated + public static final de.unihd.dbs.uima.annotator.heideltime.DocumentType NARRATIVES = de.unihd.dbs.uima.annotator.heideltime.DocumentType.NARRATIVE; + + /** Use {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType#NEWS} instead. */ + @Deprecated + public static final de.unihd.dbs.uima.annotator.heideltime.DocumentType NEWS = de.unihd.dbs.uima.annotator.heideltime.DocumentType.NEWS; + + /** Use {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType#COLLOQUIAL} instead. */ + @Deprecated + public static final de.unihd.dbs.uima.annotator.heideltime.DocumentType COLLOQUIAL = de.unihd.dbs.uima.annotator.heideltime.DocumentType.COLLOQUIAL; + + /** Use {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType#SCIENTIFIC} instead. 
*/ + @Deprecated + public static final de.unihd.dbs.uima.annotator.heideltime.DocumentType SCIENTIFIC = de.unihd.dbs.uima.annotator.heideltime.DocumentType.SCIENTIFIC; +} diff --git a/src/de/unihd/dbs/heideltime/standalone/HeidelTimeStandalone.java b/src/de/unihd/dbs/heideltime/standalone/HeidelTimeStandalone.java index 79d7043f..b9947267 100644 --- a/src/de/unihd/dbs/heideltime/standalone/HeidelTimeStandalone.java +++ b/src/de/unihd/dbs/heideltime/standalone/HeidelTimeStandalone.java @@ -29,13 +29,13 @@ import java.util.Date; import java.util.Locale; import java.util.Properties; -import java.util.logging.Level; -import java.util.logging.Logger; import org.apache.uima.UIMAFramework; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.metadata.TypeSystemDescription; import org.apache.uima.util.XMLInputSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import de.unihd.dbs.heideltime.standalone.components.JCasFactory; import de.unihd.dbs.heideltime.standalone.components.ResultFormatter; @@ -51,6 +51,7 @@ import de.unihd.dbs.heideltime.standalone.components.impl.UimaContextImpl; import de.unihd.dbs.heideltime.standalone.components.impl.XMIResultFormatter; import de.unihd.dbs.heideltime.standalone.exceptions.DocumentCreationTimeMissingException; +import de.unihd.dbs.uima.annotator.heideltime.DocumentType; import de.unihd.dbs.uima.annotator.heideltime.HeidelTime; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; import de.unihd.dbs.uima.annotator.heideltime.resources.ResourceScanner; @@ -65,6 +66,11 @@ */ public class HeidelTimeStandalone { + /** + * Logging engine + */ + private static final Logger LOG = LoggerFactory.getLogger(HeidelTimeStandalone.class); + /** * Used document type */ @@ -100,12 +106,6 @@ public class HeidelTimeStandalone { */ private Boolean doIntervalTagging; - /** - * Logging engine - */ - private static Logger logger = Logger.getLogger("HeidelTimeStandalone"); - - /** * empty constructor. 
* @@ -215,7 +215,7 @@ public void initialize(Language language, DocumentType typeToProcess, OutputType * @param doIntervalTagging Whether or not to invoke the IntervalTagger */ public void initialize(Language language, DocumentType typeToProcess, OutputType outputType, String configPath, POSTagger posTagger, Boolean doIntervalTagging) { - logger.log(Level.INFO, "HeidelTimeStandalone initialized with language " + this.language.getName()); + LOG.info("HeidelTimeStandalone initialized with language {}", language.getName()); // set the POS tagger this.posTagger = posTagger; @@ -234,14 +234,13 @@ public void initialize(Language language, DocumentType typeToProcess, OutputType try { heidelTime = new HeidelTime(); heidelTime.initialize(new UimaContextImpl(language, typeToProcess, CLISwitch.VERBOSITY2.getIsActive())); - logger.log(Level.INFO, "HeidelTime initialized"); + LOG.info("HeidelTime initialized"); } catch (Exception e) { - e.printStackTrace(); - logger.log(Level.WARNING, "HeidelTime could not be initialized"); + LOG.warn("HeidelTime could not be initialized", e); } // Initialize JCas factory ------------- - logger.log(Level.FINE, "Initializing JCas factory..."); + LOG.debug("Initializing JCas factory..."); try { TypeSystemDescription[] descriptions = new TypeSystemDescription[] { UIMAFramework @@ -253,10 +252,9 @@ public void initialize(Language language, DocumentType typeToProcess, OutputType .getResource( Config.get(Config.TYPESYSTEMHOME)))) }; jcasFactory = new JCasFactoryImpl(descriptions); - logger.log(Level.INFO, "JCas factory initialized"); + LOG.info("JCas factory initialized"); } catch (Exception e) { - e.printStackTrace(); - logger.log(Level.WARNING, "JCas factory could not be initialized"); + LOG.warn("JCas factory could not be initialized", e); } } @@ -265,7 +263,7 @@ public void initialize(Language language, DocumentType typeToProcess, OutputType * @param jcas jcas object */ private void runIntervalTagger(JCas jcas) { - logger.log(Level.FINEST, "Running Interval Tagger..."); + LOG.debug("Running Interval Tagger..."); Integer beforeAnnotations = jcas.getAnnotationIndex().size(); // Prepare the options for IntervalTagger's execution @@ -280,8 +278,8 @@ private void runIntervalTagger(JCas jcas) { iTagger.process(jcas); // debug output - Integer afterAnnotations = jcas.getAnnotationIndex().size(); - logger.log(Level.FINEST, "Annotation delta: " + (afterAnnotations - beforeAnnotations)); + int afterAnnotations = jcas.getAnnotationIndex().size(); + LOG.debug("Annotation delta: {}", afterAnnotations - beforeAnnotations); } /** @@ -338,7 +336,7 @@ private void establishHeidelTimePreconditions(JCas jcas) { * @param jcas */ private void establishPartOfSpeechInformation(JCas jcas) { - logger.log(Level.FINEST, "Establishing part of speech information..."); + LOG.debug("Establishing part of speech information..."); PartOfSpeechTagger partOfSpeechTagger = null; Properties settings = new Properties(); @@ -346,7 +344,7 @@ private void establishPartOfSpeechInformation(JCas jcas) { case ARABIC: if(POSTagger.NO.equals(posTagger)) { partOfSpeechTagger = new AllLanguagesTokenizerWrapper(); - logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Arabic. " + LOG.info("Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Arabic. 
" + "Thus, tagging results might be very different (and worse)."); } else { partOfSpeechTagger = new StanfordPOSTaggerWrapper(); @@ -360,7 +358,7 @@ private void establishPartOfSpeechInformation(JCas jcas) { case VIETNAMESE: if(POSTagger.NO.equals(posTagger)) { partOfSpeechTagger = new AllLanguagesTokenizerWrapper(); - logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Vietnamese. " + LOG.info("Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Vietnamese. " + "Thus, tagging results might be very different (and worse)."); } else { partOfSpeechTagger = new JVnTextProWrapper(); @@ -375,7 +373,7 @@ private void establishPartOfSpeechInformation(JCas jcas) { case CROATIAN: if(POSTagger.NO.equals(posTagger)) { partOfSpeechTagger = new AllLanguagesTokenizerWrapper(); - logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Croatian. " + LOG.info("Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Croatian. " + "Thus, tagging results might be very different (and worse)."); } else { partOfSpeechTagger = new HunPosTaggerWrapper(); @@ -411,18 +409,18 @@ private void establishPartOfSpeechInformation(JCas jcas) { settings.put(PartOfSpeechTagger.HUNPOS_MODEL_PATH, Config.get(Config.HUNPOS_MODEL_PATH)); } else if(POSTagger.NO.equals(posTagger)) { partOfSpeechTagger = new AllLanguagesTokenizerWrapper(); - logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for the selected language. " + LOG.info("Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for the selected language. " + "If proper preprocessing for the specified language (." 
+ language.getName() + ") is available, this might results in better " + "temporal tagging quality."); } else { - logger.log(Level.FINEST, "Sorry, but you can't use that tagger."); + LOG.warn("Sorry, but you can't use that tagger."); } } partOfSpeechTagger.initialize(settings); partOfSpeechTagger.process(jcas); partOfSpeechTagger.reset(); - logger.log(Level.FINEST, "Part of speech information established"); + LOG.trace("Part of speech information established"); } private ResultFormatter getFormatter() { @@ -495,33 +493,31 @@ public String process(String document, ResultFormatter resultFormatter) */ public String process(String document, Date documentCreationTime, ResultFormatter resultFormatter) throws DocumentCreationTimeMissingException { - logger.log(Level.INFO, "Processing started"); + LOG.info("Processing started"); // Generate jcas object ---------- - logger.log(Level.FINE, "Generate CAS object"); + LOG.debug("Generate CAS object"); JCas jcas = null; try { jcas = jcasFactory.createJCas(); jcas.setDocumentText(document); - logger.log(Level.FINE, "CAS object generated"); + LOG.debug("CAS object generated"); } catch (Exception e) { - e.printStackTrace(); - logger.log(Level.WARNING, "Cas object could not be generated"); + LOG.warn("Cas object could not be generated", e); } // Process jcas object ----------- try { - logger.log(Level.FINER, "Establishing preconditions..."); + LOG.trace("Establishing preconditions..."); provideDocumentCreationTime(jcas, documentCreationTime); establishHeidelTimePreconditions(jcas); - logger.log(Level.FINER, "Preconditions established"); + LOG.trace("Preconditions established"); heidelTime.process(jcas); - logger.log(Level.INFO, "Processing finished"); + LOG.info("Processing finished"); } catch (Exception e) { - e.printStackTrace(); - logger.log(Level.WARNING, "Processing aborted due to errors"); + LOG.warn("Processing aborted due to errors", e); } // process interval tagging --- @@ -529,15 +525,14 @@ public String process(String document, Date documentCreationTime, ResultFormatte runIntervalTagger(jcas); // Process results --------------- - logger.log(Level.FINE, "Formatting result..."); + LOG.debug("Formatting result..."); // PrintAnnotations.printAnnotations(jcas.getCas(), System.out); String result = null; try { result = resultFormatter.format(jcas); - logger.log(Level.INFO, "Result formatted"); + LOG.info("Result formatted"); } catch (Exception e) { - e.printStackTrace(); - logger.log(Level.WARNING, "Result could not be formatted"); + LOG.warn("Result could not be formatted", e); } return result; @@ -553,16 +548,16 @@ public static void main(String[] args) { // get the relevant enum CLISwitch sw = CLISwitch.getEnumFromSwitch(args[i]); if(sw == null) { // unsupported CLI switch - logger.log(Level.WARNING, "Unsupported switch: "+args[i]+". Quitting."); - System.exit(-1); + LOG.warn("Unsupported switch: "+args[i]+". Quitting."); + System.exit(1); } if(sw.getHasFollowingValue()) { // handle values for switches if(args.length > i+1 && !args[i+1].startsWith("-")) { // we still have an array index after this one and it's not a switch sw.setValue(args[++i]); } else { // value is missing or malformed - logger.log(Level.WARNING, "Invalid or missing parameter after "+args[i]+". Quitting."); - System.exit(-1); + LOG.warn("Invalid or missing parameter after "+args[i]+". 
Quitting."); + System.exit(1); } } else { // activate the value-less switches sw.setValue(null); @@ -582,43 +577,43 @@ public static void main(String[] args) { // start off with the verbosity recognition -- lots of the other // stuff can be skipped if this is set too high if(CLISwitch.VERBOSITY2.getIsActive()) { - logger.setLevel(Level.ALL); - logger.log(Level.INFO, "Verbosity: '-vv'; Logging level set to ALL."); + // FIXME: not available in slf4j facade. LOG.setLevel(Level.ALL); + LOG.info("Verbosity: '-vv'; Logging level set to ALL."); // output the found language resource folders String languagesList = ""; for(String language : ResourceScanner.getInstance().getDetectedResourceFolders()) { languagesList += System.getProperty("line.separator") + "- " + language; } - logger.log(Level.INFO, "Listing detected language folders:" + languagesList); + LOG.info("Listing detected language folders:" + languagesList); } else if(CLISwitch.VERBOSITY.getIsActive()) { - logger.setLevel(Level.INFO); - logger.log(Level.INFO, "Verbosity: '-v'; Logging level set to INFO and above."); + // FIXME: not available in slf4j facade. LOG.setLevel(Level.INFO); + LOG.info("Verbosity: '-v'; Logging level set to INFO and above."); } else { - logger.setLevel(Level.WARNING); - logger.log(Level.INFO, "Verbosity -v/-vv NOT FOUND OR RECOGNIZED; Logging level set to WARNING and above."); + // FIXME: not available in slf4j facade. LOG.setLevel(Level.WARNING); + LOG.info("Verbosity -v/-vv NOT FOUND OR RECOGNIZED; Logging level set to WARNING and above."); } // Check input encoding String encodingType = null; if(CLISwitch.ENCODING.getIsActive()) { encodingType = CLISwitch.ENCODING.getValue().toString(); - logger.log(Level.INFO, "Encoding '-e': "+encodingType); + LOG.info("Encoding '-e': "+encodingType); } else { // Encoding type not found encodingType = CLISwitch.ENCODING.getValue().toString(); - logger.log(Level.INFO, "Encoding '-e': NOT FOUND OR RECOGNIZED; set to 'UTF-8'"); + LOG.info("Encoding '-e': NOT FOUND OR RECOGNIZED; set to 'UTF-8'"); } // Check output format OutputType outputType = null; if(CLISwitch.OUTPUTTYPE.getIsActive()) { outputType = OutputType.valueOf(CLISwitch.OUTPUTTYPE.getValue().toString().toUpperCase()); - logger.log(Level.INFO, "Output '-o': "+outputType.toString().toUpperCase()); + LOG.info("Output '-o': "+outputType.toString().toUpperCase()); } else { // Output type not found outputType = (OutputType) CLISwitch.OUTPUTTYPE.getValue(); - logger.log(Level.INFO, "Output '-o': NOT FOUND OR RECOGNIZED; set to "+outputType.toString().toUpperCase()); + LOG.info("Output '-o': NOT FOUND OR RECOGNIZED; set to "+outputType.toString().toUpperCase()); } // Check language @@ -627,16 +622,16 @@ public static void main(String[] args) { language = Language.getLanguageFromString((String) CLISwitch.LANGUAGE.getValue()); if(language == Language.WILDCARD && !ResourceScanner.getInstance().getDetectedResourceFolders().contains(language.getName())) { - logger.log(Level.SEVERE, "Language '-l': "+CLISwitch.LANGUAGE.getValue()+" NOT RECOGNIZED; aborting."); + LOG.error("Language '-l': {} NOT RECOGNIZED; aborting.", CLISwitch.LANGUAGE.getValue()); printHelp(); - System.exit(-1); + System.exit(1); } else { - logger.log(Level.INFO, "Language '-l': "+language.getName()); + LOG.info("Language '-l': "+language.getName()); } } else { // Language not found language = Language.getLanguageFromString((String) CLISwitch.LANGUAGE.getValue()); - logger.log(Level.INFO, "Language '-l': NOT FOUND; set to "+language.toString().toUpperCase()); + 
LOG.info("Language '-l': NOT FOUND; set to {}", language.toString().toUpperCase()); } // Check type @@ -648,14 +643,14 @@ public static void main(String[] args) { } type = DocumentType.valueOf(CLISwitch.DOCTYPE.getValue().toString().toUpperCase()); } catch(IllegalArgumentException e) { - logger.log(Level.WARNING, "Type '-t': NOT RECOGNIZED. These are the available options: " + Arrays.asList(DocumentType.values())); - System.exit(-1); + LOG.warn("Type '-t': NOT RECOGNIZED. These are the available options: {}", Arrays.asList(DocumentType.values())); + System.exit(1); } - logger.log(Level.INFO, "Type '-t': "+type.toString().toUpperCase()); + LOG.info("Type '-t': "+type.toString().toUpperCase()); } else { // Type not found type = (DocumentType) CLISwitch.DOCTYPE.getValue(); - logger.log(Level.INFO, "Type '-t': NOT FOUND; set to "+type.toString().toUpperCase()); + LOG.info("Type '-t': NOT FOUND; set to {}", type.toString().toUpperCase()); } // Check document creation time @@ -664,21 +659,20 @@ public static void main(String[] args) { try { DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); dct = formatter.parse(CLISwitch.DCT.getValue().toString()); - logger.log(Level.INFO, "Document Creation Time '-dct': "+dct.toString()); + LOG.info("Document Creation Time '-dct': {}", dct.toString()); } catch (Exception e) { // DCT was not parseable - logger.log(Level.WARNING, "Document Creation Time '-dct': NOT RECOGNIZED. Quitting."); + LOG.warn("Document Creation Time '-dct': NOT RECOGNIZED. Quitting."); printHelp(); - System.exit(-1); + System.exit(1); } } else { if ((type == DocumentType.NEWS) || (type == DocumentType.COLLOQUIAL)) { // Dct needed dct = (Date) CLISwitch.DCT.getValue(); - logger.log(Level.INFO, "Document Creation Time '-dct': NOT FOUND; set to local date (" - + dct.toString() + ")."); + LOG.info("Document Creation Time '-dct': NOT FOUND; set to local date ({}).", dct.toString()); } else { - logger.log(Level.INFO, "Document Creation Time '-dct': NOT FOUND; skipping."); + LOG.info("Document Creation Time '-dct': NOT FOUND; skipping."); } } @@ -694,32 +688,31 @@ public static void main(String[] args) { try { Locale.setDefault(myLocale); // try to set the locale - logger.log(Level.INFO, "Locale '-locale': "+myLocale.toString()); + LOG.info("Locale '-locale': "+myLocale.toString()); } catch(Exception e) { // if the above fails, spit out error message and available locales - logger.log(Level.WARNING, "Supplied locale parameter couldn't be resolved to a working locale. Try one of these:"); - logger.log(Level.WARNING, Arrays.asList(Locale.getAvailableLocales()).toString()); // list available locales + LOG.warn("Supplied locale parameter couldn't be resolved to a working locale. 
Try one of these:"); + LOG.warn(Arrays.asList(Locale.getAvailableLocales()).toString()); // list available locales printHelp(); - System.exit(-1); + System.exit(1); } } else { // no -locale parameter supplied: just show default locale - logger.log(Level.INFO, "Locale '-locale': NOT FOUND, set to environment locale: "+Locale.getDefault().toString()); + LOG.info("Locale '-locale': NOT FOUND, set to environment locale: {}", Locale.getDefault().toString()); } // Read configuration from file String configPath = CLISwitch.CONFIGFILE.getValue().toString(); try { - logger.log(Level.INFO, "Configuration path '-c': "+configPath); + LOG.info("Configuration path '-c': "+configPath); readConfigFile(configPath); - logger.log(Level.FINE, "Config initialized"); + LOG.debug("Config initialized"); } catch (Exception e) { - e.printStackTrace(); - logger.log(Level.WARNING, "Config could not be initialized! Please supply the -c switch or " - + "put a config.props into this directory."); + LOG.warn("Config could not be initialized! Please supply the -c switch or " + + "put a config.props into this directory.", e); printHelp(); - System.exit(-1); + System.exit(1); } // Set the preprocessing POS tagger @@ -728,31 +721,31 @@ public static void main(String[] args) { try { posTagger = POSTagger.valueOf(CLISwitch.POSTAGGER.getValue().toString().toUpperCase()); } catch(IllegalArgumentException e) { - logger.log(Level.WARNING, "Given POS Tagger doesn't exist. Please specify a valid one as listed in the help."); + LOG.warn("Given POS Tagger doesn't exist. Please specify a valid one as listed in the help."); printHelp(); - System.exit(-1); + System.exit(1); } - logger.log(Level.INFO, "POS Tagger '-pos': "+posTagger.toString().toUpperCase()); + LOG.info("POS Tagger '-pos': {}", posTagger.toString().toUpperCase()); } else { // Type not found posTagger = (POSTagger) CLISwitch.POSTAGGER.getValue(); - logger.log(Level.INFO, "POS Tagger '-pos': NOT FOUND OR RECOGNIZED; set to "+posTagger.toString().toUpperCase()); + LOG.info("POS Tagger '-pos': NOT FOUND OR RECOGNIZED; set to {}", posTagger.toString().toUpperCase()); } // Set whether or not to use the Interval Tagger Boolean doIntervalTagging = false; if(CLISwitch.INTERVALS.getIsActive()) { doIntervalTagging = CLISwitch.INTERVALS.getIsActive(); - logger.log(Level.INFO, "Interval Tagger '-it': " + doIntervalTagging.toString()); + LOG.info("Interval Tagger '-it': {}", doIntervalTagging.toString()); } else { - logger.log(Level.INFO, "Interval Tagger '-it': NOT FOUND OR RECOGNIZED; set to " + doIntervalTagging.toString()); + LOG.info("Interval Tagger '-it': NOT FOUND OR RECOGNIZED; set to {}", doIntervalTagging.toString()); } // make sure we have a document path if (docPath == null) { - logger.log(Level.WARNING, "No input file given; aborting."); + LOG.warn("No input file given; aborting."); printHelp(); - System.exit(-1); + System.exit(1); } @@ -763,7 +756,7 @@ public static void main(String[] args) { FileChannel inChannel = null; PrintWriter pwOut = null; try { - logger.log(Level.INFO, "Reading document using charset: " + encodingType); + LOG.info("Reading document using charset: " + encodingType); aFile = new RandomAccessFile(docPath, "r"); inChannel = aFile.getChannel(); @@ -785,7 +778,7 @@ public static void main(String[] args) { pwOut = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8")); pwOut.println(out); } catch (Exception e) { - e.printStackTrace(); + LOG.warn(e.getMessage(), e); } finally { if(pwOut != null) { pwOut.close(); @@ -809,7 +802,7 @@ public static 
void main(String[] args) { public static void readConfigFile(String configPath) { InputStream configStream = null; try { - logger.log(Level.INFO, "trying to read in file "+configPath); + LOG.info("trying to read in file "+configPath); configStream = new FileInputStream(configPath); Properties props = new Properties(); @@ -819,11 +812,10 @@ public static void readConfigFile(String configPath) { configStream.close(); } catch (FileNotFoundException e) { - logger.log(Level.WARNING, "couldn't open configuration file \""+configPath+"\". quitting."); - System.exit(-1); + LOG.error("couldn't open configuration file \"{}\". quitting.", configPath); + throw new RuntimeException("Cannot read HeidelTime configuration."); } catch (IOException e) { - logger.log(Level.WARNING, "couldn't close config file handle"); - e.printStackTrace(); + LOG.warn("couldn't close config file handle", e); } } diff --git a/src/de/unihd/dbs/heideltime/standalone/components/impl/StandaloneConfigContext.java b/src/de/unihd/dbs/heideltime/standalone/components/impl/StandaloneConfigContext.java index e6edaaa2..a83933fa 100644 --- a/src/de/unihd/dbs/heideltime/standalone/components/impl/StandaloneConfigContext.java +++ b/src/de/unihd/dbs/heideltime/standalone/components/impl/StandaloneConfigContext.java @@ -1,172 +1,29 @@ package de.unihd.dbs.heideltime.standalone.components.impl; -import java.io.InputStream; -import java.net.URI; -import java.net.URL; -import java.util.HashMap; +import org.apache.uima.impl.RootUimaContext_impl; +import org.apache.uima.resource.ConfigurationManager; +import org.apache.uima.resource.impl.ConfigurationManager_impl; +import org.apache.uima.resource.impl.ResourceManager_impl; -import org.apache.uima.UimaContext; -import org.apache.uima.cas.AbstractCas; -import org.apache.uima.cas.SofaID; -import org.apache.uima.resource.ResourceAccessException; -import org.apache.uima.resource.Session; -import org.apache.uima.util.InstrumentationFacility; -import org.apache.uima.util.Logger; +/** + * UIMA context with manually set configuration manager. 
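+ *
+ * A minimal usage sketch (the "Language" key below mirrors the annotator
+ * parameter name used elsewhere in this patch; the snippet is illustrative):
+ *
+ * <pre>
+ * StandaloneConfigContext ctx = new StandaloneConfigContext();
+ * ctx.setConfigParameterValue("Language", "english");
+ * // resolved through the UIMA ConfigurationManager under a qualified name:
+ * Object lang = ctx.getConfigParameterValue("Language");
+ * </pre>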
+ */ +public class StandaloneConfigContext extends RootUimaContext_impl { + private ConfigurationManager mConfigManager; -@SuppressWarnings("deprecation") -public class StandaloneConfigContext implements UimaContext { - private HashMap settings = new HashMap(); - - @Override - public Object getConfigParameterValue(String aParamName) { - return settings.get(aParamName); + public StandaloneConfigContext() { + super(); + mConfigManager = new ConfigurationManager_impl(); + this.initializeRoot(null, new ResourceManager_impl(), mConfigManager); + mConfigManager.setSession(this.getSession()); } - public void setConfigParameterValue(String aParamName, Object aParamValue) { - settings.put(aParamName, aParamValue); - } - - @Override - public Object getConfigParameterValue(String aGroupName, String aParamName) { - return settings.get(aParamName); + public void setConfigParameterValue(String key, Object val) { + mConfigManager.setConfigParameterValue(makeQualifiedName(key), val); } - /* - * leave these defunct because we don't use them for now - */ - - @Override - public String[] getConfigurationGroupNames() { - // TODO Auto-generated method stub - return null; - } - - @Override - public String[] getConfigParameterNames(String aGroup) { - // TODO Auto-generated method stub - return null; - } - - @Override - public String[] getConfigParameterNames() { - // TODO Auto-generated method stub - return null; - } - - @Override - public Logger getLogger() { - // TODO Auto-generated method stub - return null; - } - @Override - public InstrumentationFacility getInstrumentationFacility() { - // TODO Auto-generated method stub - return null; + public ConfigurationManager getConfigurationManager() { + return mConfigManager; } - - @Override - public URL getResourceURL(String aKey) throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public URI getResourceURI(String aKey) throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public String getResourceFilePath(String aKey) - throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public InputStream getResourceAsStream(String aKey) - throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public Object getResourceObject(String aKey) throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public URL getResourceURL(String aKey, String[] aParams) - throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public URI getResourceURI(String aKey, String[] aParams) - throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public String getResourceFilePath(String aKey, String[] aParams) - throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public InputStream getResourceAsStream(String aKey, String[] aParams) - throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public Object getResourceObject(String aKey, String[] aParams) - throws ResourceAccessException { - // TODO Auto-generated method stub - return null; - } - - @Override - public String getDataPath() { - // TODO Auto-generated method stub - return null; - } - - @Override - public Session getSession() { - // TODO Auto-generated method stub - return null; - } - - @Override - public 
SofaID mapToSofaID(String aSofaName) { - // TODO Auto-generated method stub - return null; - } - - @Override - public String mapSofaIDToComponentSofaName(String aSofaID) { - // TODO Auto-generated method stub - return null; - } - - @Override - public SofaID[] getSofaMappings() { - // TODO Auto-generated method stub - return null; - } - - @Override - @SuppressWarnings("rawtypes") - public AbstractCas getEmptyCas(Class aCasInterface) { - // TODO Auto-generated method stub - return null; - } - } diff --git a/src/de/unihd/dbs/heideltime/standalone/components/impl/UimaContextImpl.java b/src/de/unihd/dbs/heideltime/standalone/components/impl/UimaContextImpl.java index 1cd77f5c..72c4c947 100644 --- a/src/de/unihd/dbs/heideltime/standalone/components/impl/UimaContextImpl.java +++ b/src/de/unihd/dbs/heideltime/standalone/components/impl/UimaContextImpl.java @@ -20,7 +20,7 @@ import org.apache.uima.resource.impl.ResourceManager_impl; import de.unihd.dbs.heideltime.standalone.Config; -import de.unihd.dbs.heideltime.standalone.DocumentType; +import de.unihd.dbs.uima.annotator.heideltime.DocumentType; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; /** diff --git a/src/de/unihd/dbs/heideltime/standalone/exceptions/DocumentCreationTimeMissingException.java b/src/de/unihd/dbs/heideltime/standalone/exceptions/DocumentCreationTimeMissingException.java index 8e2b3449..0230cadd 100644 --- a/src/de/unihd/dbs/heideltime/standalone/exceptions/DocumentCreationTimeMissingException.java +++ b/src/de/unihd/dbs/heideltime/standalone/exceptions/DocumentCreationTimeMissingException.java @@ -10,12 +10,10 @@ * * HeidelTime is a multilingual, cross-domain temporal tagger. * For details, see http://dbs.ifi.uni-heidelberg.de/heideltime - */ + */ package de.unihd.dbs.heideltime.standalone.exceptions; -import de.unihd.dbs.heideltime.standalone.DocumentType; - /** * Exception thrown if document creation time is missing while processing a document of type {@link DocumentType#NEWS} * @@ -23,10 +21,8 @@ * @version 1.0 */ public class DocumentCreationTimeMissingException extends Exception { - /** * */ private static final long serialVersionUID = -157033697488394828L; - } diff --git a/src/de/unihd/dbs/uima/annotator/alllanguagestokenizer/AllLanguagesTokenizer.java b/src/de/unihd/dbs/uima/annotator/alllanguagestokenizer/AllLanguagesTokenizer.java index e312c2c8..5ddbf61e 100644 --- a/src/de/unihd/dbs/uima/annotator/alllanguagestokenizer/AllLanguagesTokenizer.java +++ b/src/de/unihd/dbs/uima/annotator/alllanguagestokenizer/AllLanguagesTokenizer.java @@ -8,6 +8,7 @@ import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; import de.unihd.dbs.uima.types.heideltime.Sentence; @@ -60,135 +61,127 @@ public List tokenize(JCas jcas) { if(line.matches("^<.*>$")) { // SGML tag outBuf.append(line + "\n"); - } else { - // add a blank at the beginning and the end of each segment - line = " " + line + " "; - - // insert missing blanks after punctuation - line = line.replaceAll("\\.\\.\\.", " ... 
"); - line = line.replaceAll("([;\\!\\?])([^ ])", "$1 $2"); - line = line.replaceAll("([.,:])([^ 0-9.])", "$1 $2"); - - String[] lines = line.split(" "); - - for(String token : lines) { - // remove some whitespaces that \s doesn't catch - if(token.equals("")) - continue; - - String suffix = ""; - - // separate punctuation and parentheses from words - Boolean finished = false; - Matcher m; - do { - finished = true; - - // cut off preceding punctuation - m = Pattern.compile("^([" + PChar + "])(.)").matcher(token); - if(m.find()) { - token = token.replaceAll("^([" + PChar + "])(.)", "$2"); - outBuf.append(m.group(1) + "\n"); - finished = false; - } - - // cut off trailing punctuation - m = Pattern.compile("(.)([" + FChar + "])$").matcher(token); - if(m.find()) { - token = token.replaceAll("(.)([" + FChar + "])$", "$1"); - suffix = m.group(2) + "\n" + suffix; - finished = false; - } - - // cut off trailing periods if punctuation precedes - m = Pattern.compile("([" + FChar + "])\\.$").matcher(token); - if(m.find()) { - token = token.replaceAll("([" + FChar + "])\\.$", ""); - suffix = ".\n" + suffix; - - if(token.equals("")) { - token = m.group(1); - } else { - suffix = m.group(1) + "\n" + suffix; - } - - finished = false; + continue; + } + // add a blank at the beginning and the end of each segment + line = " " + line + " "; + + // insert missing blanks after punctuation + line = line.replaceAll("\\.\\.\\.", " ... "); + line = line.replaceAll("([;\\!\\?])([^ ])", "$1 $2"); + line = line.replaceAll("([.,:])([^ 0-9.])", "$1 $2"); + + String[] lines = line.split(" "); + + for(String token : lines) { + // remove some whitespaces that \s doesn't catch + if(token.equals("")) + continue; + + String suffix = ""; + + // separate punctuation and parentheses from words + boolean finished = false; + Matcher m; + do { + finished = true; + + // cut off preceding punctuation + m = Pattern.compile("^([" + PChar + "])(.)").matcher(token); + if(m.find()) { + token = token.replaceAll("^([" + PChar + "])(.)", "$2"); + outBuf.append(m.group(1) + "\n"); + finished = false; + } + + // cut off trailing punctuation + m = Pattern.compile("(.)([" + FChar + "])$").matcher(token); + if(m.find()) { + token = token.replaceAll("(.)([" + FChar + "])$", "$1"); + suffix = m.group(2) + "\n" + suffix; + finished = false; + } + + // cut off trailing periods if punctuation precedes + m = Pattern.compile("([" + FChar + "])\\.$").matcher(token); + if(m.find()) { + token = token.replaceAll("([" + FChar + "])\\.$", ""); + suffix = ".\n" + suffix; + + if(token.equals("")) { + token = m.group(1); + } else { + suffix = m.group(1) + "\n" + suffix; } - } while(!finished); - /* TODO:commented out because those are language-specific + + finished = false; + } + } while(!finished); + /* TODO:commented out because those are language-specific // handle explicitly listed tokens if(abbreviations.contains(token)) { outBuf.append(token + "\n" + suffix); continue; }*/ - - // abbreviations of the form A. or U.S.A. - if(token.matches("^([A-Za-z-]\\.)+$")) { - outBuf.append(token + "\n" + suffix); - continue; - } - - // disambiguate periods - m = Pattern.compile("^(..*)\\.$").matcher(token); - if(m.matches() && !line.equals("...") - /* TODO:commented out because those are language-specific: && !(flags.contains(Flag.GALICIAN) && token.matches("^[0-9]+\\.$"))*/) { - token = m.group(1); - suffix = ".\n" + suffix; - /* TODO:commented out because those are language-specific + + // abbreviations of the form A. or U.S.A. 
+ if(token.matches("^([A-Za-z-]\\.)+$")) { + outBuf.append(token + "\n" + suffix); + continue; + } + + // disambiguate periods + m = Pattern.compile("^(..*)\\.$").matcher(token); + if(m.matches() && !line.equals("...") + /* TODO:commented out because those are language-specific: && !(flags.contains(Flag.GALICIAN) && token.matches("^[0-9]+\\.$"))*/) { + token = m.group(1); + suffix = ".\n" + suffix; + /* TODO:commented out because those are language-specific if(abbreviations.contains(token)) { outBuf.append(token + "\n" + suffix); continue; }*/ - } - - // cut off clitics + } + + // cut off clitics + while(true) { + m = Pattern.compile("^(--)(.)").matcher(token); + if(!m.find()) + break; + + token = token.replaceAll("^(--)(.)", "$2"); + outBuf.append(m.group(1) + "\n"); + } + if(!PClitic.equals("")) { while(true) { - m = Pattern.compile("^(--)(.)").matcher(token); - - if(!m.find()) { + m = Pattern.compile("^(" + PClitic + ")(.)").matcher(token); + if(!m.find()) break; - } - - token = token.replaceAll("^(--)(.)", "$2"); + + token = token.replaceAll("^(" + PClitic + ")(.)", "$2"); outBuf.append(m.group(1) + "\n"); } - if(!PClitic.equals("")) { - while(true) { - m = Pattern.compile("^(" + PClitic + ")(.)").matcher(token); - - if(!m.find()) { - break; - } - - token = token.replaceAll("^(" + PClitic + ")(.)", "$2"); - outBuf.append(m.group(1) + "\n"); - } - } - + } + + while(true) { + m = Pattern.compile("^(--)(.)").matcher(token); + if(!m.find()) + break; + + token = token.replaceAll("^(--)(.)", "$1"); + suffix = m.group(2) + "\n" + suffix; + } + if(!FClitic.equals("")) { while(true) { - m = Pattern.compile("^(--)(.)").matcher(token); - - if(!m.find()) { + m = Pattern.compile("(.)(" + FClitic + ")$").matcher(token); + if(!m.find()) break; - } - - token = token.replaceAll("^(--)(.)", "$1"); + + token = token.replaceAll("(.)(" + FClitic + ")$", "$1"); suffix = m.group(2) + "\n" + suffix; } - if(!FClitic.equals("")) { - while(true) { - m = Pattern.compile("(.)(" + FClitic + ")$").matcher(token); - - if(!m.find()) { - break; - } - - token = token.replaceAll("(.)(" + FClitic + ")$", "$1"); - suffix = m.group(2) + "\n" + suffix; - } - } - outBuf.append(token + "\n" + suffix); } + outBuf.append(token + "\n" + suffix); } } } @@ -196,11 +189,11 @@ public List tokenize(JCas jcas) { // find the tokens in the original text and create token annotations LinkedList outList = new LinkedList(); String origText = jcas.getDocumentText(); - Integer origTextOffset = 0; + int origTextOffset = 0; for(String s : outBuf.toString().split("\n")) { - Integer begin = origText.indexOf(s, origTextOffset); - Integer end = begin + s.length(); + int begin = origText.indexOf(s, origTextOffset); + int end = begin + s.length(); Token t = new Token(jcas); t.setBegin(begin); @@ -219,17 +212,17 @@ public List tokenize(JCas jcas) { public List sentenceTokenize(JCas jcas) { List outList = new LinkedList(); - FSIterator tokIt = jcas.getAnnotationIndex(Token.type).iterator(); + AnnotationIndex tokens = jcas.getAnnotationIndex(Token.type); + FSIterator tokIt = tokens.iterator(); Sentence s = new Sentence(jcas); - Boolean sentenceStarted = false; + boolean sentenceStarted = false; Token tOld = null; Token t = null; while(tokIt.hasNext()) { - if (!(t == null)){ + if (t != null) tOld = t; - } - t = (Token) tokIt.next(); + t = tokIt.next(); // set sentence beginning if(sentenceStarted == false) { @@ -243,8 +236,7 @@ public List sentenceTokenize(JCas jcas) { */ if(!tokIt.hasNext() || (t.getCoveredText().matches("[.:!\\?]+") && - (!((tOld != 
null && tOld.getCoveredText().matches("[\\d]+")) || - ((jcas.getDocumentText().substring(t.getEnd()).length() > 2) && (jcas.getDocumentText().substring(t.getEnd(),t.getEnd()+3)).matches(" [A-Z][.-]")))))){ + !((tOld != null && tOld.getCoveredText().matches("[\\d]+")) || (jcas.getDocumentText().substring(t.getEnd()).length() > 2 && jcas.getDocumentText().substring(t.getEnd(),t.getEnd()+3).matches(" [A-Z][.-]"))))){ // ((!(tOld.getCoveredText().matches("[\\d]+")))) && (!((jcas.getDocumentText().substring(t.getEnd())).matches("^[\\s]*"))))) { // (t.getCoveredText().matches("[.:!\\?]+") && (!(tOld.getCoveredText().matches("[\\d]+"))))) { // das funktioniert ok sentenceStarted = false; @@ -252,7 +244,7 @@ public List sentenceTokenize(JCas jcas) { // check for whether the punctuation mark is followed by a closing quotation mark if(tokIt.hasNext()) { - Token tNext = (Token) tokIt.next(); + Token tNext = tokIt.next(); if(tNext.getCoveredText().matches("[»’'\"‛”‟›〞』」﹄"'」﹂]+")) { s.setEnd(tNext.getEnd()); diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/DocumentType.java b/src/de/unihd/dbs/uima/annotator/heideltime/DocumentType.java new file mode 100644 index 00000000..e3a5bc3b --- /dev/null +++ b/src/de/unihd/dbs/uima/annotator/heideltime/DocumentType.java @@ -0,0 +1,34 @@ +package de.unihd.dbs.uima.annotator.heideltime; + +/** + * Heideltime document types. + */ +public enum DocumentType { + COLLOQUIAL("colloquial"), NEWS("news"), NARRATIVE("narrative"), SCIENTIFIC("scientific"); + String name; + + DocumentType(String name) { + this.name = name; + } + + @Override + public String toString() { + return name; + } + + public static DocumentType of(String s) { + switch (s) { + case "colloquial": + return COLLOQUIAL; + case "news": + return NEWS; + case "narrative": + case "narratives": + return NARRATIVE; + case "scientific": + return SCIENTIFIC; + default: + throw new IllegalArgumentException("Unknown document type: " + s); + } + } +} \ No newline at end of file diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/HeidelTime.java b/src/de/unihd/dbs/uima/annotator/heideltime/HeidelTime.java index e94f96b2..c9055343 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/HeidelTime.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/HeidelTime.java @@ -14,92 +14,97 @@ package de.unihd.dbs.uima.annotator.heideltime; +import static de.unihd.dbs.uima.annotator.heideltime.utilities.ParseInteger.parseInt; +import static de.unihd.dbs.uima.annotator.heideltime.utilities.ParseInteger.parseIntAt; + import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.regex.MatchResult; +import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.uima.UimaContext; +import org.apache.uima.analysis_component.AnalysisComponent; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import de.unihd.dbs.uima.annotator.heideltime.ProcessorManager.Priority; import de.unihd.dbs.uima.annotator.heideltime.processors.TemponymPostprocessing; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; import de.unihd.dbs.uima.annotator.heideltime.resources.NormalizationManager; import 
de.unihd.dbs.uima.annotator.heideltime.resources.RePatternManager; -import de.unihd.dbs.uima.annotator.heideltime.resources.RegexHashMap; +import de.unihd.dbs.uima.annotator.heideltime.resources.Rule; +import de.unihd.dbs.uima.annotator.heideltime.resources.RuleExpansion; import de.unihd.dbs.uima.annotator.heideltime.resources.RuleManager; -import de.unihd.dbs.uima.annotator.heideltime.utilities.DateCalculator; -import de.unihd.dbs.uima.annotator.heideltime.utilities.ContextAnalyzer; +import de.unihd.dbs.uima.annotator.heideltime.utilities.DurationSimplification; import de.unihd.dbs.uima.annotator.heideltime.utilities.LocaleException; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox; -import de.unihd.dbs.uima.types.heideltime.Dct; +import de.unihd.dbs.uima.annotator.heideltime.utilities.TokenBoundaryMatcher; import de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Timex3; import de.unihd.dbs.uima.types.heideltime.Token; - /** - * HeidelTime finds temporal expressions and normalizes them according to the TIMEX3 - * TimeML annotation standard. + * HeidelTime finds temporal expressions and normalizes them according to the TIMEX3 TimeML annotation standard. * * @author jannik stroetgen * */ public class HeidelTime extends JCasAnnotator_ImplBase { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(HeidelTime.class); - // TOOL NAME (may be used as componentId) - private Class component = this.getClass(); - // PROCESSOR MANAGER private ProcessorManager procMan = new ProcessorManager(); // COUNTER (how many timexes added to CAS? (finally) - public int timex_counter = 0; + public int timex_counter = 0; public int timex_counter_global = 0; - - // FLAG (for historic expressions referring to BC) - public Boolean flagHistoricDates = false; - + // COUNTER FOR TIMEX IDS private int timexID = 0; - + // INPUT PARAMETER HANDLING WITH UIMA - private String PARAM_LANGUAGE = "Language"; - // supported languages (2012-05-19): english, german, dutch, englishcoll, englishsci - private String PARAM_TYPE_TO_PROCESS = "Type"; + private String PARAM_LANGUAGE = "Language"; + // supported languages (2012-05-19): english, german, dutch, englishcoll, + // englishsci + private String PARAM_TYPE_TO_PROCESS = "Type"; // chosen locale parameter name - private String PARAM_LOCALE = "locale"; - // supported types (2012-05-19): news (english, german, dutch), narrative (english, german, dutch), colloquial - private Language language = Language.ENGLISH; - private String typeToProcess = "news"; - + private String PARAM_LOCALE = "locale"; + // supported types (2012-05-19): news (english, german, dutch), narrative + // (english, german, dutch), colloquial + private Language language = Language.ENGLISH; + private DocumentType typeToProcess = DocumentType.NEWS; + // INPUT PARAMETER HANDLING WITH UIMA (which types shall be extracted) - private String PARAM_DATE = "Date"; - private String PARAM_TIME = "Time"; - private String PARAM_DURATION = "Duration"; - private String PARAM_SET = "Set"; + private String PARAM_DATE = "Date"; + private String PARAM_TIME = "Time"; + private String PARAM_DURATION = "Duration"; + private String PARAM_SET = "Set"; private String PARAM_TEMPONYMS = "Temponym"; - private String PARAM_DEBUG = "Debugging"; - private String PARAM_GROUP = "ConvertDurations"; - private Boolean find_dates = true; - private Boolean find_times = true; - private Boolean find_durations = 
true; - private Boolean find_sets = true; - private Boolean find_temponyms = false; - private Boolean group_gran = true; + private String PARAM_GROUP = "ConvertDurations"; + private boolean find_dates = true; + private boolean find_times = true; + private boolean find_durations = true; + private boolean find_sets = true; + private boolean find_temponyms = false; + private boolean group_gran = true; // FOR DEBUGGING PURPOSES (IF FALSE) - private Boolean deleteOverlapped = true; + private boolean deleteOverlapping = true; + + // Whether to generate "allTokIds" strings. + // Required for TempEval! + private boolean doAllTokIds = true; + private ResolveAmbiguousValues resolver; /** * @see AnalysisComponent#initialize(UimaContext) @@ -108,222 +113,204 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept super.initialize(aContext); ///////////////////////////////// - // DEBUGGING PARAMETER SETTING // - ///////////////////////////////// - this.deleteOverlapped = true; - Boolean doDebug = (Boolean) aContext.getConfigParameterValue(PARAM_DEBUG); - Logger.setPrintDetails(doDebug == null ? false : doDebug); - - ///////////////////////////////// - // HANDLE LOCALE // + // HANDLE LOCALE // ///////////////////////////////// String requestedLocale = (String) aContext.getConfigParameterValue(PARAM_LOCALE); - if(requestedLocale == null || requestedLocale.length() == 0) { // if the PARAM_LOCALE setting was left empty, - Locale.setDefault(Locale.UK); // use a default, the ISO8601-adhering UK locale (equivalent to "en_GB") - } else { // otherwise, check if the desired locale exists in the JVM's available locale repertoire + if (requestedLocale == null || requestedLocale.length() == 0) { + // if the PARAM_LOCALE setting was left empty, + Locale.setDefault(Locale.UK); + // use the ISO8601-adhering UK locale (equivalent to "en_GB") + } else { // otherwise, check if the desired locale exists in the JVM's + // available locale repertoire try { - Locale locale = DateCalculator.getLocaleFromString(requestedLocale); - Locale.setDefault(locale); // sets it for the entire JVM session + Locale locale = getLocaleFromString(requestedLocale); + Locale.setDefault(locale); // sets it for the entire JVM + // session } catch (LocaleException e) { - Logger.printError("Supplied locale parameter couldn't be resolved to a working locale. Try one of these:"); - String localesString = new String(); - for(Locale l : Locale.getAvailableLocales()) { // list all available locales - localesString += l.toString()+" "; + StringBuilder localesString = new StringBuilder(); + localesString.append("Supplied locale parameter couldn't be resolved to a working locale. 
Try one of these:"); + for (Locale l : Locale.getAvailableLocales()) { + // list all available locales + localesString.append(l.toString()).append(' '); } - Logger.printError(localesString); - System.exit(-1); + LOG.error(localesString.toString()); + System.exit(1); } } - + ////////////////////////////////// // GET CONFIGURATION PARAMETERS // ////////////////////////////////// language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE)); - - typeToProcess = (String) aContext.getConfigParameterValue(PARAM_TYPE_TO_PROCESS); - find_dates = (Boolean) aContext.getConfigParameterValue(PARAM_DATE); - find_times = (Boolean) aContext.getConfigParameterValue(PARAM_TIME); + + typeToProcess = DocumentType.of((String) aContext.getConfigParameterValue(PARAM_TYPE_TO_PROCESS)); + find_dates = (Boolean) aContext.getConfigParameterValue(PARAM_DATE); + find_times = (Boolean) aContext.getConfigParameterValue(PARAM_TIME); find_durations = (Boolean) aContext.getConfigParameterValue(PARAM_DURATION); - find_sets = (Boolean) aContext.getConfigParameterValue(PARAM_SET); + find_sets = (Boolean) aContext.getConfigParameterValue(PARAM_SET); find_temponyms = (Boolean) aContext.getConfigParameterValue(PARAM_TEMPONYMS); - group_gran = (Boolean) aContext.getConfigParameterValue(PARAM_GROUP); + group_gran = (Boolean) aContext.getConfigParameterValue(PARAM_GROUP); //////////////////////////////////////////////////////////// // READ NORMALIZATION RESOURCES FROM FILES AND STORE THEM // //////////////////////////////////////////////////////////// NormalizationManager.getInstance(language, find_temponyms); - + ////////////////////////////////////////////////////// // READ PATTERN RESOURCES FROM FILES AND STORE THEM // ////////////////////////////////////////////////////// RePatternManager.getInstance(language, find_temponyms); - + /////////////////////////////////////////////////// // READ RULE RESOURCES FROM FILES AND STORE THEM // /////////////////////////////////////////////////// RuleManager.getInstance(language, find_temponyms); - - ///////////////////////////////////////////////////////////////////////////////// - // SUBPROCESSOR CONFIGURATION. REGISTER YOUR OWN PROCESSORS HERE FOR EXECUTION // - ///////////////////////////////////////////////////////////////////////////////// - procMan.registerProcessor("de.unihd.dbs.uima.annotator.heideltime.processors.HolidayProcessor"); - procMan.registerProcessor("de.unihd.dbs.uima.annotator.heideltime.processors.DecadeProcessor"); + + /////////////////////////////////////////////////////////////////// + // SUBPROCESSOR CONFIGURATION. REGISTER YOUR OWN PROCESSORS HERE // + /////////////////////////////////////////////////////////////////// + procMan.registerProcessor(de.unihd.dbs.uima.annotator.heideltime.processors.HolidayProcessor.class.getName()); + procMan.registerProcessor(de.unihd.dbs.uima.annotator.heideltime.processors.DecadeProcessor.class.getName()); procMan.initializeAllProcessors(aContext); - + ///////////////////////////// // PRINT WHAT WILL BE DONE // ///////////////////////////// - if (find_dates) Logger.printDetail("Getting Dates..."); - if (find_times) Logger.printDetail("Getting Times..."); - if (find_durations) Logger.printDetail("Getting Durations..."); - if (find_sets) Logger.printDetail("Getting Sets..."); - if (find_temponyms) Logger.printDetail("Getting Temponyms..."); + LOG.debug("Enabled modules:{}{}{}{}{}", // + find_dates ? " dates" : "", // + find_times ? " times" : "", // + find_durations ? 
" durations" : "", // + find_sets ? " sets" : "", // + find_temponyms ? " temponyms" : ""); + + if (resolver == null) + resolver = new ResolveAmbiguousValues(); + resolver.init(language, find_temponyms, typeToProcess); } - /** * @see JCasAnnotator_ImplBase#process(JCas) */ public void process(JCas jcas) { - // check whether a given DCT (if any) is of the correct format and if not, skip this call - if(!isValidDCT(jcas)) { - Logger.printError(component, "The reader component of this workflow has set an incorrect DCT." - + " HeidelTime expects either \"YYYYMMDD\" or \"YYYY-MM-DD...\". This document was skipped."); + // check whether a given DCT (if any) is of the correct format and if not,skip this call + if (!ResolveAmbiguousValues.ParsedDct.isValidDCT(jcas)) { + LOG.error("The reader component of this workflow has set an incorrect DCT.\n" + // + " HeidelTime expects either \"YYYYMMDD\" or \"YYYY-MM-DD...\", got \"{}\".\n" + // + "This document was skipped.", ResolveAmbiguousValues.ParsedDct.getDct(jcas)); return; } - + // run preprocessing processors procMan.executeProcessors(jcas, Priority.PREPROCESSING); - + RuleManager rulem = RuleManager.getInstance(language, find_temponyms); - + timexID = 1; // reset counter once per document processing timex_counter = 0; - flagHistoricDates = false; - + boolean flagHistoricDates = false; + //////////////////////////////////////////// // CHECK SENTENCE BY SENTENCE FOR TIMEXES // //////////////////////////////////////////// - FSIterator sentIter = jcas.getAnnotationIndex(Sentence.type).iterator(); - /* - * check if the pipeline has annotated any sentences. if not, heideltime can't do any work, - * will return from process() with a warning message. + AnnotationIndex sentences = jcas.getAnnotationIndex(Sentence.type); + /* + * check if the pipeline has annotated any sentences. if not, heideltime can't do any work, will return from process() with a warning message. */ - if(!sentIter.hasNext()) { - Logger.printError(component, "HeidelTime has not found any sentence tokens in this document. " + - "HeidelTime needs sentence tokens tagged by a preprocessing UIMA analysis engine to " + - "do its work. Please check your UIMA workflow and add an analysis engine that creates " + - "these sentence tokens."); + if (sentences.size() == 0) { + LOG.error("HeidelTime has not found any sentence tokens in this document. " + "HeidelTime needs sentence tokens tagged by a preprocessing UIMA analysis engine to " + + "do its work. 
Please check your UIMA workflow and add an analysis engine that creates " + "these sentence tokens."); } - - while (sentIter.hasNext()) { - Sentence s = (Sentence) sentIter.next(); - - Boolean debugIteration = false; - Boolean oldDebugState = Logger.getPrintDetails(); - do { - try { - if (find_dates) { - findTimexes("DATE", rulem.getHmDatePattern(), rulem.getHmDateOffset(), rulem.getHmDateNormalization(), s, jcas); - } - if (find_times) { - findTimexes("TIME", rulem.getHmTimePattern(), rulem.getHmTimeOffset(), rulem.getHmTimeNormalization(), s, jcas); - } - - /* - * check for historic dates/times starting with BC - * to check if post-processing step is required - */ - if (typeToProcess.equals("narrative") || typeToProcess.equals("narratives")){ - FSIterator iterDates = jcas.getAnnotationIndex(Timex3.type).iterator(); - while (iterDates.hasNext()){ - Timex3 t = (Timex3) iterDates.next(); - if (t.getTimexValue().startsWith("BC")){ - flagHistoricDates = true; - break; - } - } - } - - if (find_sets) { - findTimexes("SET", rulem.getHmSetPattern(), rulem.getHmSetOffset(), rulem.getHmSetNormalization(), s, jcas); - } - if (find_durations) { - findTimexes("DURATION", rulem.getHmDurationPattern(), rulem.getHmDurationOffset(), rulem.getHmDurationNormalization(), s, jcas); - } - if (find_temponyms) { - findTimexes("TEMPONYM", rulem.getHmTemponymPattern(), rulem.getHmTemponymOffset(), rulem.getHmTemponymNormalization(), s, jcas); - } - } catch(NullPointerException npe) { - if(!debugIteration) { - debugIteration = true; - Logger.setPrintDetails(true); - - Logger.printError(component, "HeidelTime's execution has been interrupted by an exception that " + - "is likely rooted in faulty normalization resource files. Please consider opening an issue " + - "report containing the following information at our GitHub project issue tracker: " + - "https://github.com/HeidelTime/heideltime/issues - Thanks!"); - npe.printStackTrace(); - Logger.printError(component, "Sentence [" + s.getBegin() + "-" + s.getEnd() + "]: " + s.getCoveredText()); - Logger.printError(component, "Language: " + language); - Logger.printError(component, "Re-running this sentence with DEBUGGING enabled..."); - } else { - debugIteration = false; - Logger.setPrintDetails(oldDebugState); - - Logger.printError(component, "Execution will now resume."); - } - } - } while(debugIteration); + + TokenBoundaryMatcher matcher = new TokenBoundaryMatcher(); + for (Sentence s : sentences) { + try { + final CharSequence coveredText = TokenBoundaryMatcher.simplifyString(s.getCoveredText()); + if (LOG.isTraceEnabled()) + LOG.trace("Sentence {}: {}", s.getSentenceId(), coveredText); + + // Build a list of "good" token positions to anchor matches: + matcher.tokenBoundaries(coveredText, s, jcas); + + if (find_dates) + findTimexes("DATE", rulem.getHmDateRules(), matcher, s, jcas, coveredText); + if (find_times) + findTimexes("TIME", rulem.getHmTimeRules(), matcher, s, jcas, coveredText); + + /* + * check for historic dates/times starting with BC to check if post-processing step is required + */ + if (typeToProcess == DocumentType.NARRATIVE) { + AnnotationIndex dates = jcas.getAnnotationIndex(Timex3.type); + for (Timex3 t : dates) + if (t.getTimexValue().startsWith("BC")) { + flagHistoricDates = true; + break; + } + } + + if (find_sets) + findTimexes("SET", rulem.getHmSetRules(), matcher, s, jcas, coveredText); + if (find_durations) + findTimexes("DURATION", rulem.getHmDurationRules(), matcher, s, jcas, coveredText); + if (find_temponyms) + findTimexes("TEMPONYM", 
rulem.getHmTemponymRules(), matcher, s, jcas, coveredText); + } catch (NullPointerException npe) { + LOG.error("HeidelTime's execution has been interrupted by an exception that " + "is likely rooted in faulty normalization resource files. " + + "Please consider opening an issue report containing the following " + + "information at our GitHub project issue tracker (if possible, also enable debug logging): " + + "https://github.com/HeidelTime/heideltime/issues - Thanks!", npe); + LOG.error("Sentence [" + s.getBegin() + "-" + s.getEnd() + "]: " + s.getCoveredText()); + LOG.error("Language: " + language); + // LOG.error("Re-running this sentence with DEBUGGING + // enabled..."); + // TODO: add a flag to force-log debugging information? + } } /* * kick out some overlapping expressions */ - if (deleteOverlapped == true) - deleteOverlappedTimexesPreprocessing(jcas); + if (deleteOverlapping) + deleteOverlappingTimexesPreprocessing(jcas); /* - * specify ambiguous values, e.g.: specific year for date values of - * format UNDEF-year-01-01; specific month for values of format UNDEF-last-month + * specify ambiguous values, e.g.: specific year for date values of format UNDEF-year-01-01; specific month for values of format UNDEF-last-month */ - specifyAmbiguousValues(jcas); - + if (resolver != null) + resolver.specifyAmbiguousValues(jcas); + // disambiguate historic dates // check dates without explicit hints to AD or BC if they might refer to BC dates if (flagHistoricDates) try { disambiguateHistoricDates(jcas); - } catch(Exception e) { - Logger.printError("Something went wrong disambiguating historic dates."); - e.printStackTrace(); + } catch (Exception e) { + LOG.error("Failed disambiguating historic dates: {}", e.getMessage(), e); } - if (find_temponyms) { + if (find_temponyms) TemponymPostprocessing.handleIntervals(jcas); - } - + /* * kick out the rest of the overlapping expressions */ - if (deleteOverlapped == true) + if (deleteOverlapping) deleteOverlappedTimexesPostprocessing(jcas); - + // run arbitrary processors procMan.executeProcessors(jcas, Priority.ARBITRARY); - + // remove invalid timexes removeInvalids(jcas); - + // run postprocessing processors procMan.executeProcessors(jcas, Priority.POSTPROCESSING); - timex_counter_global = timex_counter_global + timex_counter; - Logger.printDetail(component, "Number of Timexes added to CAS: "+timex_counter + "(global: "+timex_counter_global+")"); + timex_counter_global += timex_counter; + LOG.info("Number of Timexes added to CAS: {} (global: {})", timex_counter, timex_counter_global); } - /** * Add timex annotation to CAS object. 
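	 * When doAllTokIds is set, the ids of the covered tokens are additionally recorded in allTokIds (required for TempEval).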
* @@ -335,1791 +322,386 @@ public void process(JCas jcas) { * @param foundByRule * @param jcas */ - public void addTimexAnnotation(String timexType, int begin, int end, Sentence sentence, String timexValue, String timexQuant, - String timexFreq, String timexMod, String emptyValue, String timexId, String foundByRule, JCas jcas) { - + public void addTimexAnnotation(String timexType, int begin, int end, Sentence sentence, String timexValue, String timexQuant, String timexFreq, String timexMod, String emptyValue, + String timexId, String foundByRule, JCas jcas) { + Timex3 annotation = new Timex3(jcas); annotation.setBegin(begin); annotation.setEnd(end); annotation.setFilename(sentence.getFilename()); annotation.setSentId(sentence.getSentenceId()); - + annotation.setEmptyValue(emptyValue); - FSIterator iterToken = jcas.getAnnotationIndex(Token.type).subiterator(sentence); - String allTokIds = ""; - while (iterToken.hasNext()) { - Token tok = (Token) iterToken.next(); - if (tok.getBegin() <= begin && tok.getEnd() > begin) { - annotation.setFirstTokId(tok.getTokenId()); - allTokIds = "BEGIN<-->" + tok.getTokenId(); - } - if ((tok.getBegin() > begin) && (tok.getEnd() <= end)) { - allTokIds = allTokIds + "<-->" + tok.getTokenId(); + AnnotationIndex<Token> tokens = jcas.getAnnotationIndex(Token.type); + if (doAllTokIds) { + StringBuilder allTokIds = new StringBuilder(); + for (FSIterator<Token> iterToken = tokens.subiterator(sentence); iterToken.hasNext();) { + Token tok = iterToken.next(); + if (tok.getBegin() <= begin && tok.getEnd() > begin) { + annotation.setFirstTokId(tok.getTokenId()); + allTokIds.setLength(0); + allTokIds.append("BEGIN<-->").append(tok.getTokenId()); + } + if ((tok.getBegin() > begin) && (tok.getEnd() <= end)) + allTokIds.append("<-->").append(tok.getTokenId()); } + annotation.setAllTokIds(allTokIds.toString()); } - annotation.setAllTokIds(allTokIds); annotation.setTimexType(timexType); annotation.setTimexValue(timexValue); annotation.setTimexId(timexId); annotation.setFoundByRule(foundByRule); - if ((timexType.equals("DATE")) || (timexType.equals("TIME"))) { - if ((timexValue.startsWith("X")) || (timexValue.startsWith("UNDEF"))) { - annotation.setFoundByRule(foundByRule+"-relative"); + if (timexType.equals("DATE") || timexType.equals("TIME")) { + if (timexValue.startsWith("X") || timexValue.startsWith("UNDEF")) { + annotation.setFoundByRule(foundByRule + "-relative"); } else { - annotation.setFoundByRule(foundByRule+"-explicit"); + annotation.setFoundByRule(foundByRule + "-explicit"); } } - if (!(timexQuant == null)) { + if (timexQuant != null) annotation.setTimexQuant(timexQuant); - } - if (!(timexFreq == null)) { + if (timexFreq != null) annotation.setTimexFreq(timexFreq); - } - if (!(timexMod == null)) { + if (timexMod != null) annotation.setTimexMod(timexMod); - } annotation.addToIndexes(); this.timex_counter++; - - Logger.printDetail(annotation.getTimexId()+"EXTRACTION PHASE: "+" found by:"+annotation.getFoundByRule()+" text:"+annotation.getCoveredText()); - Logger.printDetail(annotation.getTimexId()+"NORMALIZATION PHASE:"+" found by:"+annotation.getFoundByRule()+" text:"+annotation.getCoveredText()+" value:"+annotation.getTimexValue()); - + + if (LOG.isTraceEnabled()) { + LOG.trace(annotation.getTimexId() + " EXTRACTION PHASE: " + " found by:" + annotation.getFoundByRule() + " text:" + annotation.getCoveredText()); + LOG.trace(annotation.getTimexId() + " NORMALIZATION PHASE:" + " found by:" + annotation.getFoundByRule() + " text:" + annotation.getCoveredText() + " value:" + + 
annotation.getTimexValue()); + } } - /** - * Postprocessing: Check dates starting with "0" which were extracted without - * explicit "AD" hints if it is likely that they refer to the respective date BC + * Postprocessing: Check dates starting with "0" which were extracted without explicit "AD" hints if it is likely that they refer to the respective date BC * * @param jcas */ - public void disambiguateHistoricDates(JCas jcas){ - + public void disambiguateHistoricDates(JCas jcas) { // build up a list with all found TIMEX expressions - List linearDates = new ArrayList(); - FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator(); + AnnotationIndex<Timex3> annotations = jcas.getAnnotationIndex(Timex3.type); // Create List of all Timexes of types "date" and "time" - while (iterTimex.hasNext()) { - Timex3 timex = (Timex3) iterTimex.next(); - if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME")) { + List<Timex3> linearDates = new ArrayList<Timex3>(); + for (Timex3 timex : annotations) + if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME")) linearDates.add(timex); - } - } - - ////////////////////////////////////////////// - // go through list of Date and Time timexes // - ////////////////////////////////////////////// + + ////////////////////////////////////////////// + // go through list of Date and Time timexes // + ////////////////////////////////////////////// for (int i = 1; i < linearDates.size(); i++) { - Timex3 t_i = (Timex3) linearDates.get(i); - String value_i = t_i.getTimexValue(); - String newValue = value_i; - Boolean change = false; - if (!(t_i.getFoundByRule().contains("-BCADhint"))){ - if (value_i.startsWith("0")){ - Integer offset = 1, counter = 1; - do { - if ((i == 1 || (i > 1 && !change)) && linearDates.get(i-offset).getTimexValue().startsWith("BC")){ - if (value_i.length()>1){ - if ((linearDates.get(i-offset).getTimexValue().startsWith("BC"+value_i.substring(0,2))) || - (linearDates.get(i-offset).getTimexValue().startsWith("BC"+String.format("%02d",(Integer.parseInt(value_i.substring(0,2))+1))))){ - if (((value_i.startsWith("00")) && (linearDates.get(i-offset).getTimexValue().startsWith("BC00"))) || - ((value_i.startsWith("01")) && (linearDates.get(i-offset).getTimexValue().startsWith("BC01")))){ - if ((value_i.length()>2) && (linearDates.get(i-offset).getTimexValue().length()>4)){ - if (Integer.parseInt(value_i.substring(0,3)) <= Integer.parseInt(linearDates.get(i-offset).getTimexValue().substring(2,5))){ - newValue = "BC" + value_i; - change = true; - Logger.printDetail("DisambiguateHistoricDates: "+value_i+" to "+newValue+". 
Expression "+t_i.getCoveredText()+" due to "+linearDates.get(i-offset).getCoveredText()); - } - } - } - else{ + Timex3 t_i = linearDates.get(i); + if (t_i.getFoundByRule().contains("-BCADhint")) + continue; + String value_i = t_i.getTimexValue(), newValue = value_i; + if (value_i.charAt(0) != '0') + continue; + boolean change = false; + int offset = 1, counter = 1; + do { + String txval = linearDates.get(i - offset).getTimexValue(); + if ((i == 1 || (i > 1 && !change)) && txval.startsWith("BC")) { + if (value_i.length() > 1) { + if (txval.startsWith("BC" + value_i.substring(0, 2)) // + || txval.startsWith(String.format("BC%02d", parseInt(value_i, 0, 2) + 1))) { + if ((value_i.startsWith("00") && txval.startsWith("BC00")) || (value_i.startsWith("01") && txval.startsWith("BC01"))) { + if ((value_i.length() > 2) && (txval.length() > 4)) { + if (parseInt(value_i, 0, 3) <= parseInt(txval, 2, 5)) { newValue = "BC" + value_i; change = true; - Logger.printDetail("DisambiguateHistoricDates: "+value_i+" to "+newValue+". Expression "+t_i.getCoveredText()+" due to "+linearDates.get(i-offset).getCoveredText()); + if (LOG.isDebugEnabled()) + LOG.debug("DisambiguateHistoricDates: " + value_i + " to " + newValue + ". Expression " + t_i.getCoveredText() + + " due to " + linearDates.get(i - offset).getCoveredText()); } } - } - } - - if ((linearDates.get(i-offset).getTimexType().equals("TIME") || linearDates.get(i-offset).getTimexType().equals("DATE")) && - (linearDates.get(i-offset).getTimexValue().matches("^\\d.*"))) { - counter++; + } else { + newValue = "BC" + value_i; + change = true; + if (LOG.isDebugEnabled()) + LOG.debug("DisambiguateHistoricDates: " + value_i + " to " + newValue + ". Expression " + t_i.getCoveredText() + " due to " + + linearDates.get(i - offset).getCoveredText()); + } } - } while (counter < 5 && ++offset < i); + } } - } - if (!(newValue.equals(value_i))){ + + String txtype = linearDates.get(i - offset).getTimexType(); + if ((txtype.equals("TIME") || txtype.equals("DATE")) && txval.matches("^\\d.*")) { + counter++; + } + } while (counter < 5 && ++offset < i); + if (!newValue.equals(value_i)) { t_i.removeFromIndexes(); - Logger.printDetail("DisambiguateHistoricDates: value changed to BC"); + LOG.debug("DisambiguateHistoricDates: value changed to BC"); t_i.setTimexValue(newValue); t_i.addToIndexes(); linearDates.set(i, t_i); } - } + } } - + /** - * Postprocessing: Remove invalid timex expressions. These are already - * marked as invalid: timexValue().equals("REMOVE") + * Postprocessing: Remove invalid timex expressions. 
These are already marked as invalid: timexValue().equals("REMOVE") * * @param jcas */ public void removeInvalids(JCas jcas) { - /* - * Iterate over timexes and add invalids to HashSet - * (invalids cannot be removed directly since iterator is used) + * Iterate over timexes and add invalids to HashSet (invalids cannot be removed directly since iterator is used) */ - FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator(); + AnnotationIndex<Timex3> timexes = jcas.getAnnotationIndex(Timex3.type); HashSet<Timex3> hsTimexToRemove = new HashSet<Timex3>(); - while (iterTimex.hasNext()) { - Timex3 timex = (Timex3) iterTimex.next(); - if (timex.getTimexValue().equals("REMOVE")) { + for (Timex3 timex : timexes) + if (timex.getTimexValue().equals("REMOVE")) hsTimexToRemove.add(timex); - } - } // remove invalids, finally for (Timex3 timex3 : hsTimexToRemove) { timex3.removeFromIndexes(); this.timex_counter--; - Logger.printDetail(timex3.getTimexId()+" REMOVING PHASE: "+"found by:"+timex3.getFoundByRule()+" text:"+timex3.getCoveredText()+" value:"+timex3.getTimexValue()); + if (LOG.isDebugEnabled()) + LOG.debug("{} REMOVING PHASE: found by: {} text:{} value:{}", timex3.getTimexId(), timex3.getFoundByRule(), timex3.getCoveredText(), timex3.getTimexValue()); } } - @SuppressWarnings("unused") - public String specifyAmbiguousValuesString(String ambigString, Timex3 t_i, Integer i, List linearDates, JCas jcas) { - NormalizationManager norm = NormalizationManager.getInstance(language, find_temponyms); - - // ////////////////////////////////////// - // IS THERE A DOCUMENT CREATION TIME? // - // ////////////////////////////////////// - boolean dctAvailable = false; - - // //////////////////////////// - // DOCUMENT TYPE TO PROCESS // - // ////////////////////////// - boolean documentTypeNews = false; - boolean documentTypeNarrative = false; - boolean documentTypeColloquial = false; - boolean documentTypeScientific = false; - if (typeToProcess.equals("news")) { - documentTypeNews = true; - } - if (typeToProcess.equals("narrative") - || typeToProcess.equals("narratives")) { - documentTypeNarrative = true; - } - if (typeToProcess.equals("colloquial")) { - documentTypeColloquial = true; - } - if (typeToProcess.equals("scientific")) { - documentTypeScientific = true; - } - - // get the dct information - String dctValue = ""; - int dctCentury = 0; - int dctYear = 0; - int dctDecade = 0; - int dctMonth = 0; - int dctDay = 0; - String dctSeason = ""; - String dctQuarter = ""; - String dctHalf = ""; - int dctWeekday = 0; - int dctWeek = 0; - - // //////////////////////////////////////////// - // INFORMATION ABOUT DOCUMENT CREATION TIME // - // //////////////////////////////////////////// - FSIterator dctIter = jcas.getAnnotationIndex(Dct.type).iterator(); - if (dctIter.hasNext()) { - dctAvailable = true; - Dct dct = (Dct) dctIter.next(); - dctValue = dct.getValue(); - // year, month, day as mentioned in the DCT - if (dctValue.matches("\\d\\d\\d\\d\\d\\d\\d\\d")) { - dctCentury = Integer.parseInt(dctValue.substring(0, 2)); - dctYear = Integer.parseInt(dctValue.substring(0, 4)); - dctDecade = Integer.parseInt(dctValue.substring(2, 3)); - dctMonth = Integer.parseInt(dctValue.substring(4, 6)); - dctDay = Integer.parseInt(dctValue.substring(6, 8)); - - Logger.printDetail("dctCentury:" + dctCentury); - Logger.printDetail("dctYear:" + dctYear); - Logger.printDetail("dctDecade:" + dctDecade); - Logger.printDetail("dctMonth:" + dctMonth); - Logger.printDetail("dctDay:" + dctDay); - } else { - dctCentury = 
Integer.parseInt(dctValue.substring(0, 2)); - dctYear = Integer.parseInt(dctValue.substring(0, 4)); - dctDecade = Integer.parseInt(dctValue.substring(2, 3)); - dctMonth = Integer.parseInt(dctValue.substring(5, 7)); - dctDay = Integer.parseInt(dctValue.substring(8, 10)); - - Logger.printDetail("dctCentury:" + dctCentury); - Logger.printDetail("dctYear:" + dctYear); - Logger.printDetail("dctDecade:" + dctDecade); - Logger.printDetail("dctMonth:" + dctMonth); - Logger.printDetail("dctDay:" + dctDay); - } - dctQuarter = "Q" - + norm.getFromNormMonthInQuarter(norm - .getFromNormNumber(dctMonth + "")); - dctHalf = "H1"; - if (dctMonth > 6) { - dctHalf = "H2"; - } - - // season, week, weekday, have to be calculated - dctSeason = norm.getFromNormMonthInSeason(norm - .getFromNormNumber(dctMonth + "") + ""); - dctWeekday = DateCalculator.getWeekdayOfDate(dctYear + "-" - + norm.getFromNormNumber(dctMonth + "") + "-" - + norm.getFromNormNumber(dctDay + "")); - dctWeek = DateCalculator.getWeekOfDate(dctYear + "-" - + norm.getFromNormNumber(dctMonth + "") + "-" - + norm.getFromNormNumber(dctDay + "")); - - Logger.printDetail("dctQuarter:" + dctQuarter); - Logger.printDetail("dctSeason:" + dctSeason); - Logger.printDetail("dctWeekday:" + dctWeekday); - Logger.printDetail("dctWeek:" + dctWeek); - } else { - Logger.printDetail("No DCT available..."); - } - - // check if value_i has month, day, season, week (otherwise no UNDEF-year is possible) - Boolean viHasMonth = false; - Boolean viHasDay = false; - Boolean viHasSeason = false; - Boolean viHasWeek = false; - Boolean viHasQuarter = false; - Boolean viHasHalf = false; - int viThisMonth = 0; - int viThisDay = 0; - String viThisSeason = ""; - String viThisQuarter = ""; - String viThisHalf = ""; - String[] valueParts = ambigString.split("-"); - // check if UNDEF-year or UNDEF-century - if ((ambigString.startsWith("UNDEF-year")) || (ambigString.startsWith("UNDEF-century"))) { - if (valueParts.length > 2) { - // get vi month - if (valueParts[2].matches("\\d\\d")) { - viHasMonth = true; - viThisMonth = Integer.parseInt(valueParts[2]); - } - // get vi season - else if ((valueParts[2].equals("SP")) || (valueParts[2].equals("SU")) || (valueParts[2].equals("FA")) || (valueParts[2].equals("WI"))) { - viHasSeason = true; - viThisSeason = valueParts[2]; - } - // get v1 quarter - else if ((valueParts[2].equals("Q1")) || (valueParts[2].equals("Q2")) || (valueParts[2].equals("Q3")) || (valueParts[2].equals("Q4"))) { - viHasQuarter = true; - viThisQuarter = valueParts[2]; - } - // get v1 half - else if ((valueParts[2].equals("H1")) || (valueParts[2].equals("H2"))) { - viHasHalf = true; - viThisHalf = valueParts[2]; - } - // get vi day - if ((valueParts.length > 3) && (valueParts[3].matches("\\d\\d"))) { - viHasDay = true; - viThisDay = Integer.parseInt(valueParts[3]); - } + /** + * Pre-processing: removes timexes that are overlapped by, or duplicate, another timex in the CAS. + * @param jcas + */ + private void deleteOverlappingTimexesPreprocessing(JCas jcas) { + AnnotationIndex<Timex3> timexes = jcas.getAnnotationIndex(Timex3.type); + HashSet<Timex3> hsTimexesToRemove = new HashSet<Timex3>(); + for (Timex3 t1 : timexes) { + if (hsTimexesToRemove.contains(t1)) { + continue; } - } - else { - if (valueParts.length > 1) { - // get vi month - if (valueParts[1].matches("\\d\\d")) { - viHasMonth = true; - viThisMonth = Integer.parseInt(valueParts[1]); + for (Timex3 t2 : timexes) { + if (t1 == t2 || hsTimexesToRemove.contains(t2)) { + continue; } - // get vi season - else if ((valueParts[1].equals("SP")) || (valueParts[1].equals("SU")) || (valueParts[1].equals("FA")) || (valueParts[1].equals("WI"))) 
{ - viHasSeason = true; - viThisSeason = valueParts[1]; - } - // get v1 quarter - else if ((valueParts[1].equals("Q1")) || (valueParts[1].equals("Q2")) || (valueParts[1].equals("Q3")) || (valueParts[1].equals("Q4"))) { - viHasQuarter = true; - viThisQuarter = valueParts[1]; - } - // get v1 half - else if ((valueParts[1].equals("H1")) || (valueParts[1].equals("H2"))) { - viHasHalf = true; - viThisHalf = valueParts[1]; - } - // get vi day - if ((valueParts.length > 2) && (valueParts[2].matches("\\d\\d"))) { - viHasDay = true; - viThisDay = Integer.parseInt(valueParts[2]); - } - } - } - // get the last tense (depending on the part of speech tags used in front or behind the expression) - String last_used_tense = ContextAnalyzer.getLastTense(t_i, jcas, language); - - ////////////////////////// - // DISAMBIGUATION PHASE // - ////////////////////////// - - //////////////////////////////////////////////////// - // IF YEAR IS COMPLETELY UNSPECIFIED (UNDEF-year) // - //////////////////////////////////////////////////// - String valueNew = ambigString; - if (ambigString.startsWith("UNDEF-year")) { - String newYearValue = dctYear+""; - // vi has month (ignore day) - if ((viHasMonth == true) && (viHasSeason == false)) { - // WITH DOCUMENT CREATION TIME - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - // Tense is FUTURE - if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) { - // if dct-month is larger than vi-month, than add 1 to dct-year - if (dctMonth > viThisMonth) { - int intNewYear = dctYear + 1; - newYearValue = intNewYear + ""; - } - } - // Tense is PAST - if ((last_used_tense.equals("PAST"))) { - // if dct-month is smaller than vi month, than substrate 1 from dct-year - if (dctMonth < viThisMonth) { - int intNewYear = dctYear - 1; - newYearValue = intNewYear + ""; - } - } + if ( // t1 starts inside or with t2 and ends before t2 -> remove t1 + ((t1.getBegin() >= t2.getBegin()) && (t1.getEnd() < t2.getEnd())) + // t1 starts inside t2 and ends with or before t2 -> remove t1 + || ((t1.getBegin() > t2.getBegin()) && (t1.getEnd() <= t2.getEnd()))) { + logRemove(t1, "overlaps and begins later than", t2); + hsTimexesToRemove.add(t1); + continue; } - // WITHOUT DOCUMENT CREATION TIME - else { - newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); + // t2 starts inside or with t1 and ends before t1 -> remove t2 + if (((t2.getBegin() >= t1.getBegin()) && (t2.getEnd() < t1.getEnd())) + // t2 starts inside t1 and ends with or before t1 -> remove t2 + || ((t2.getBegin() > t1.getBegin()) && (t2.getEnd() <= t1.getEnd()))) { + logRemove(t2, "overlaps and begins later than", t1); + hsTimexesToRemove.add(t2); + continue; } - } - // vi has quaurter - if (viHasQuarter == true) { - // WITH DOCUMENT CREATION TIME - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - // Tense is FUTURE - if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) { - if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))) { - int intNewYear = dctYear + 1; - newYearValue = intNewYear + ""; - } - } - // Tense is PAST - if ((last_used_tense.equals("PAST"))) { - if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))) { - int intNewYear = dctYear - 1; - newYearValue = intNewYear + ""; - } - } - // IF NO TENSE IS FOUND - if (last_used_tense.equals("")){ - if (documentTypeColloquial){ - // IN 
COLLOQUIAL: future temporal expressions - if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))){ - int intNewYear = dctYear + 1; - newYearValue = intNewYear + ""; - } - } - else{ - // IN NEWS: past temporal expressions - if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))){ - int intNewYear = dctYear - 1; - newYearValue = intNewYear + ""; - } - } + // identical length + if ((t1.getBegin() == t2.getBegin()) && (t1.getEnd() == t2.getEnd())) { + if (t1.getTimexValue().startsWith("UNDEF") && !t2.getTimexValue().startsWith("UNDEF")) { + logRemove(t1, "is UNDEF, compared to", t2); + hsTimexesToRemove.add(t1); + } else if (!t1.getTimexValue().startsWith("UNDEF") && t2.getTimexValue().startsWith("UNDEF")) { + logRemove(t2, "is UNDEF, compared to", t1); + hsTimexesToRemove.add(t2); } - } - // WITHOUT DOCUMENT CREATION TIME - else { - newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); - } - } - // vi has half - if (viHasHalf == true) { - // WITH DOCUMENT CREATION TIME - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - // Tense is FUTURE - if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) { - if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))) { - int intNewYear = dctYear + 1; - newYearValue = intNewYear + ""; - } + // t1 is explicit, but t2 is not + else if (t1.getFoundByRule().endsWith("explicit") && !t2.getFoundByRule().endsWith("explicit")) { + logRemove(t2, "is not explicit, compared to", t1); + hsTimexesToRemove.add(t2); } - // Tense is PAST - if ((last_used_tense.equals("PAST"))) { - if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))) { - int intNewYear = dctYear - 1; - newYearValue = intNewYear + ""; - } + // remove timexes that are identical, but one has an emptyvalue + else if (t2.getEmptyValue().isEmpty() && !t1.getEmptyValue().isEmpty()) { + logRemove(t2, "has emptyvalue, compared to", t1); + hsTimexesToRemove.add(t2); } - // IF NO TENSE IS FOUND - if (last_used_tense.equals("")){ - if (documentTypeColloquial){ - // IN COLLOQUIAL: future temporal expressions - if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))){ - int intNewYear = dctYear + 1; - newYearValue = intNewYear + ""; - } - } - else{ - // IN NEWS: past temporal expressions - if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))){ - int intNewYear = dctYear - 1; - newYearValue = intNewYear + ""; - } - } + // REMOVE REAL DUPLICATES (the one with the lower timexID) + else if (parseIntAt(t1.getTimexId(), 1) < parseIntAt(t2.getTimexId(), 1)) { + logRemove(t1, "has lower id value than", t2); + hsTimexesToRemove.add(t1); } } - // WITHOUT DOCUMENT CREATION TIME - else { - newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); - } } - - // vi has season - if ((viHasMonth == false) && (viHasDay == false) && (viHasSeason == true)) { - // TODO check tenses? 
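// (Summary of the resolution pattern throughout this UNDEF-year block, added for readability: when a DCT
// is available in news/colloquial/scientific documents, the unspecified year is copied from the DCT and
// shifted by one where the detected tense demands it; otherwise it falls back to the last mentioned year
// obtained via ContextAnalyzer.getLastMentionedX.)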
- // WITH DOCUMENT CREATION TIME - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - newYearValue = dctYear+""; - } - // WITHOUT DOCUMENT CREATION TIME - else { - newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); + } + } + // remove, finally + for (Timex3 t : hsTimexesToRemove) { + t.removeFromIndexes(); + timex_counter--; + } + } + + private void logRemove(Timex3 t1, String reason, Timex3 t2) { + if (LOG.isTraceEnabled()) { + LOG.trace("DUPLICATE: {} (id:{} value:{} found by:{}) removed because it {} {} (id:{} value:{} found by:{})", // + t1.getCoveredText(), t1.getTimexId(), t1.getTimexValue(), t1.getFoundByRule(), // + reason, // + t2.getCoveredText(), t2.getTimexId(), t2.getTimexValue(), t2.getFoundByRule()); + } + } + + private void deleteOverlappedTimexesPostprocessing(JCas jcas) { + AnnotationIndex<Timex3> timexes = jcas.getAnnotationIndex(Timex3.type); + HashSet<ArrayList<Timex3>> effectivelyToInspect = new HashSet<ArrayList<Timex3>>(); + ArrayList<Timex3> allTimexesToInspect = new ArrayList<Timex3>(); + for (Timex3 myTimex : timexes) { + ArrayList<Timex3> timexSet = new ArrayList<Timex3>(); + if (!myTimex.getTimexType().equals("TEMPONYM")) { + timexSet.add(myTimex); } - // vi has week - if (viHasWeek) { - // WITH DOCUMENT CREATION TIME - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - newYearValue = dctYear+""; - } - // WITHOUT DOCUMENT CREATION TIME - else { - newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); + + // compare this timex to all other timexes and mark those that + // have an overlap + for (Timex3 myInnerTimex : timexes) { + if (!myInnerTimex.getTimexType().equals("TEMPONYM")) { + if (// timex1 starts, timex2 is partial overlap + (myTimex.getBegin() <= myInnerTimex.getBegin() && myTimex.getEnd() > myInnerTimex.getBegin()) || + // same as above, but in reverse + (myInnerTimex.getBegin() <= myTimex.getBegin() && myInnerTimex.getEnd() > myTimex.getBegin()) || + // timex 1 is contained within or identical to timex2 + (myInnerTimex.getBegin() <= myTimex.getBegin() && myTimex.getEnd() <= myInnerTimex.getEnd()) || + // same as above, but in reverse + (myTimex.getBegin() <= myInnerTimex.getBegin() && myInnerTimex.getEnd() <= myTimex.getEnd())) { + + // increase the set + timexSet.add(myInnerTimex); + // note that these timexes are being looked at + allTimexesToInspect.add(myTimex); + allTimexesToInspect.add(myInnerTimex); + } } } - // REPLACE THE UNDEF-YEAR WITH THE NEWLY CALCULATED YEAR AND ADD TIMEX TO INDEXES - if (newYearValue.equals("")) { - valueNew = ambigString.replaceFirst("UNDEF-year", "XXXX"); - } - else { - valueNew = ambigString.replaceFirst("UNDEF-year", newYearValue); + // if overlaps with myTimex were detected, memorize them + if (timexSet.size() > 1) + effectivelyToInspect.add(timexSet); + } + + /* + * prune those sets of overlapping timexes that are subsets of others (i.e. 
leave only the largest union of overlapping timexes) */ + HashSet<ArrayList<Timex3>> newEffectivelyToInspect = new HashSet<ArrayList<Timex3>>(); + for (Timex3 t : allTimexesToInspect) { + ArrayList<Timex3> setToKeep = new ArrayList<Timex3>(); + + // determine the largest set that contains this timex + for (ArrayList<Timex3> tSet : effectivelyToInspect) { + if (tSet.contains(t) && tSet.size() > setToKeep.size()) + setToKeep = tSet; } + + newEffectivelyToInspect.add(setToKeep); } + // overwrite previous list of sets + effectivelyToInspect = newEffectivelyToInspect; - /////////////////////////////////////////////////// - // just century is unspecified (UNDEF-century86) // - /////////////////////////////////////////////////// - else if ((ambigString.startsWith("UNDEF-century"))) { - String newCenturyValue = dctCentury+""; - - // NEWS and COLLOQUIAL DOCUMENTS - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && !ambigString.equals("UNDEF-century")) { - int viThisDecade = Integer.parseInt(ambigString.substring(13, 14)); - - Logger.printDetail("dctCentury"+dctCentury); - - newCenturyValue = dctCentury+""; - Logger.printDetail("dctCentury"+dctCentury); - - // Tense is FUTURE - if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) { - if (viThisDecade < dctDecade) { - newCenturyValue = dctCentury + 1+""; - } else { - newCenturyValue = dctCentury+""; - } - } - // Tense is PAST - if ((last_used_tense.equals("PAST"))) { - if (dctDecade < viThisDecade) { - newCenturyValue = dctCentury - 1+""; - } else { - newCenturyValue = dctCentury+""; - } + // iterate over the selected sets and merge information, remove old timexes + for (ArrayList<Timex3> tSet : effectivelyToInspect) { + Timex3 newTimex; + + // if a timex has the timex value REMOVE, remove it from consideration + @SuppressWarnings("unchecked") + ArrayList<Timex3> newTSet = (ArrayList<Timex3>) tSet.clone(); + for (Timex3 t : tSet) { + // remove timexes with value "REMOVE" + if (t.getTimexValue().equals("REMOVE")) { + newTSet.remove(t); } } - // NARRATIVE DOCUMENTS - else { - newCenturyValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "century", language); - if (!(newCenturyValue.startsWith("BC"))){ - if ((newCenturyValue.matches("^\\d\\d.*")) && (Integer.parseInt(newCenturyValue.substring(0, 2)) < 10)){ - newCenturyValue = "00"; + tSet = newTSet; + + // iteration is done if all the timexes have been removed, i.e. + // the set is empty + if (tSet.size() == 0) + continue; + + /* + * check - whether all timexes of this set have the same timex type attribute, - which one in the set has the longest value attribute string length, - what the combined extents + * are + */ + boolean allSameTypes = true; + String timexType = null; + Timex3 longestTimex = null; + int combinedBegin = Integer.MAX_VALUE, combinedEnd = Integer.MIN_VALUE; + ArrayList<Integer> tokenIds = new ArrayList<Integer>(); + for (Timex3 t : tSet) { + // check whether the types are identical and either all + // DATE or TIME + if (timexType == null) { + timexType = t.getTimexType(); + } else { + if (allSameTypes && !timexType.equals(t.getTimexType()) || !(timexType.equals("DATE") || timexType.equals("TIME"))) { + allSameTypes = false; } - }else{ - newCenturyValue = "00"; + } - } - if (newCenturyValue.equals("")){ - if (!(documentTypeNarrative)) { - // always assume that sixties, twenties, and so on are 19XX if no century found (LREC change) - valueNew = ambigString.replaceFirst("UNDEF-century", "19"); + if (LOG.isTraceEnabled()) + LOG.trace("Are these overlapping timexes of same type? 
=> {}", allSameTypes); + + // check timex value attribute string length + if (longestTimex == null) { + longestTimex = t; + } else if (allSameTypes && t.getFoundByRule().indexOf("-BCADhint") != -1) { + longestTimex = t; + } else if (allSameTypes && t.getFoundByRule().indexOf("relative") == -1 && longestTimex.getFoundByRule().indexOf("relative") != -1) { + longestTimex = t; + } else if (longestTimex.getTimexValue().length() == t.getTimexValue().length()) { + if (t.getBegin() < longestTimex.getBegin()) + longestTimex = t; + } else if (longestTimex.getTimexValue().length() < t.getTimexValue().length()) { + longestTimex = t; } - // LREC change: assume in narrative-style documents that if no other century was mentioned before, 1st century - else { - valueNew = ambigString.replaceFirst("UNDEF-century", "00"); + if (LOG.isTraceEnabled()) + LOG.trace("Selected {}: {} [{}] as the longest-valued timex.", longestTimex.getTimexId(), longestTimex.getCoveredText(), longestTimex.getTimexValue()); + + // check combined beginning/end + if (combinedBegin > t.getBegin()) + combinedBegin = t.getBegin(); + if (combinedEnd < t.getEnd()) + combinedEnd = t.getEnd(); + if (LOG.isTraceEnabled()) + LOG.trace("Selected combined constraints: {}:{}", combinedBegin, combinedEnd); + + // disassemble and remember the token ids + if (doAllTokIds) { + String[] tokenizedTokenIds = t.getAllTokIds().split("<-->"); + for (int i = 1; i < tokenizedTokenIds.length; i++) { + int tokid = parseInt(tokenizedTokenIds[i]); + if (!tokenIds.contains(tokid)) + tokenIds.add(tokid); + } } } - else { - valueNew = ambigString.replaceFirst("UNDEF-century", newCenturyValue); - } - // always assume that sixties, twenties, and so on are 19XX -- if not narrative document (LREC change) - if ((valueNew.matches("\\d\\d\\d")) && (!(documentTypeNarrative))) { - valueNew = "19" + valueNew.substring(2); + + /* + * types are equal => merge constraints, use the longer, "more granular" value. if types are not equal, just take the longest value. + */ + Collections.sort(tokenIds); + newTimex = longestTimex; + if (allSameTypes) { + newTimex.setBegin(combinedBegin); + newTimex.setEnd(combinedEnd); + if (tokenIds.size() > 0) + newTimex.setFirstTokId(tokenIds.get(0)); + String tokenIdText = "BEGIN"; + for (Integer tokenId : tokenIds) + tokenIdText += "<-->" + tokenId; + newTimex.setAllTokIds(tokenIdText); } + + // remove old overlaps. + for (Timex3 t : tSet) + t.removeFromIndexes(); + // add the single constructed/chosen timex to the indexes. 
+ newTimex.addToIndexes(); } - - //////////////////////////////////////////////////// - // CHECK IMPLICIT EXPRESSIONS STARTING WITH UNDEF // - //////////////////////////////////////////////////// - else if (ambigString.startsWith("UNDEF")) { - valueNew = ambigString; - if (ambigString.matches("^UNDEF-REFDATE$")){ - if (i > 0){ - Timex3 anyDate = linearDates.get(i-1); - String lmDate = anyDate.getTimexValue(); - valueNew = lmDate; - } - else{ - valueNew = "XXXX-XX-XX"; - } + } - ////////////////// - // TO CALCULATE // - ////////////////// - // year to calculate - } else if (ambigString.matches("^UNDEF-(this|REFUNIT|REF)-(.*?)-(MINUS|PLUS)-([0-9]+).*")) { - for (MatchResult mr : Toolbox.findMatches(Pattern.compile("^(UNDEF-(this|REFUNIT|REF)-(.*?)-(MINUS|PLUS)-([0-9]+)).*"), ambigString)) { - String checkUndef = mr.group(1); - String ltn = mr.group(2); - String unit = mr.group(3); - String op = mr.group(4); - String sDiff = mr.group(5); - int diff = 0; - try { - diff = Integer.parseInt(sDiff); - } catch (Exception e) { - Logger.printError(component, "Expression difficult to normalize: "); - Logger.printError(component, ambigString); - Logger.printError(component, sDiff + " probably too long for parsing as integer."); - Logger.printError(component, "set normalized value as PAST_REF / FUTURE_REF:"); - if (op.equals("PLUS")){ - valueNew = "FUTURE_REF"; - } - else { - valueNew = "PAST_REF"; - } - break; - } - - - // do the processing for SCIENTIFIC documents (TPZ identification could be improved) - if ((documentTypeScientific)){ - String opSymbol = "-"; - if (op.equals("PLUS")){ - opSymbol = "+"; - } - if (unit.equals("year")){ - String diffString = diff+""; - if (diff < 10){ - diffString = "000"+diff; - } - else if (diff < 100){ - diffString = "00"+diff; - } - else if (diff < 1000){ - diffString = "0"+diff; - } - valueNew = "TPZ"+opSymbol+diffString; - } - else if (unit.equals("month")){ - String diffString = diff+""; - if (diff < 10){ - diffString = "0000-0"+diff; - } - else { - diffString = "0000-"+diff; - } - valueNew = "TPZ"+opSymbol+diffString; - } - else if (unit.equals("week")){ - String diffString = diff+""; - if (diff < 10){ - diffString = "0000-W0"+diff; - } - else { - diffString = "0000-W"+diff; - } - valueNew = "TPZ"+opSymbol+diffString; - } - else if (unit.equals("day")){ - String diffString = diff+""; - if (diff < 10){ - diffString = "0000-00-0"+diff; - } - else { - diffString = "0000-00-"+diff; - } - valueNew = "TPZ"+opSymbol+diffString; - } - else if (unit.equals("hour")){ - String diffString = diff+""; - if (diff < 10){ - diffString = "0000-00-00T0"+diff; - } - else { - diffString = "0000-00-00T"+diff; - } - valueNew = "TPZ"+opSymbol+diffString; - } - else if (unit.equals("minute")){ - String diffString = diff+""; - if (diff < 10){ - diffString = "0000-00-00T00:0"+diff; - } - else { - diffString = "0000-00-00T00:"+diff; - } - valueNew = "TPZ"+opSymbol+diffString; - } - else if (unit.equals("second")){ - String diffString = diff+""; - if (diff < 10){ - diffString = "0000-00-00T00:00:0"+diff; - } - else { - diffString = "0000-00-00T00:00:"+diff; - } - valueNew = "TPZ"+opSymbol+diffString; - } - } - else{ - - - // check for REFUNIT (only allowed for "year") - if ((ltn.equals("REFUNIT")) && (unit.equals("year"))) { - String dateWithYear = ContextAnalyzer.getLastMentionedX(linearDates, i, "dateYear", language); - String year = dateWithYear; - if (dateWithYear.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX"); - } else { - if (dateWithYear.startsWith("BC")){ - year = 
dateWithYear.substring(0,6); - } - else{ - year = dateWithYear.substring(0,4); - } - if (op.equals("MINUS")) { - diff = diff * (-1); - } - String yearNew = DateCalculator.getXNextYear(dateWithYear, diff); - String rest = dateWithYear.substring(4); - valueNew = valueNew.replace(checkUndef, yearNew+rest); - } - } - - - // REF and this are handled here - if (unit.equals("century")) { - if ((documentTypeNews|documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { - int century = dctCentury; - if (op.equals("MINUS")) { - century = dctCentury - diff; - } else if (op.equals("PLUS")) { - century = dctCentury + diff; - } - valueNew = valueNew.replace(checkUndef, century+""); - } else { - String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates, i, "century", language); - if (lmCentury.equals("")) { - valueNew = valueNew.replace(checkUndef, "XX"); - } else { - if (op.equals("MINUS")) { - diff = (-1) * diff; - } - lmCentury = DateCalculator.getXNextCentury(lmCentury, diff); - valueNew = valueNew.replace(checkUndef, lmCentury); - } - } - } else if (unit.equals("decade")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { - int dctDecadeLong = Integer.parseInt(dctCentury + "" + dctDecade); - int decade = dctDecadeLong; - if (op.equals("MINUS")) { - decade = dctDecadeLong - diff; - } else if (op.equals("PLUS")) { - decade = dctDecadeLong + diff; - } - valueNew = valueNew.replace(checkUndef, decade+"X"); - } else { - String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates, i, "decade", language); - if (lmDecade.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXX"); - } else { - if (op.equals("MINUS")) { - diff = (-1) * diff; - } - lmDecade = DateCalculator.getXNextDecade(lmDecade, diff); - valueNew = valueNew.replace(checkUndef, lmDecade); - } - } - } else if (unit.equals("year")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { - int intValue = dctYear; - if (op.equals("MINUS")) { - intValue = dctYear - diff; - } else if (op.equals("PLUS")) { - intValue = dctYear + diff; - } - valueNew = valueNew.replace(checkUndef, intValue + ""); - } else { - String lmYear = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language); - if (lmYear.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX"); - } else { - if (op.equals("MINUS")) { - diff = (-1) * diff; - } - lmYear = DateCalculator.getXNextYear(lmYear, diff); - valueNew = valueNew.replace(checkUndef, lmYear); - } - } - // TODO BC years - } else if (unit.equals("quarter")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { - int intYear = dctYear; - int intQuarter = Integer.parseInt(dctQuarter.substring(1)); - int diffQuarters = diff % 4; - diff = diff - diffQuarters; - int diffYears = diff / 4; - if (op.equals("MINUS")) { - diffQuarters = diffQuarters * (-1); - diffYears = diffYears * (-1); - } - intYear = intYear + diffYears; - intQuarter = intQuarter + diffQuarters; - valueNew = valueNew.replace(checkUndef, intYear+"-Q"+intQuarter); - } else { - String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language); - if (lmQuarter.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - int intYear = Integer.parseInt(lmQuarter.substring(0, 4)); - int intQuarter = Integer.parseInt(lmQuarter.substring(6)); - int diffQuarters = diff 
% 4; - diff = diff - diffQuarters; - int diffYears = diff / 4; - if (op.equals("MINUS")) { - diffQuarters = diffQuarters * (-1); - diffYears = diffYears * (-1); - } - intYear = intYear + diffYears; - intQuarter = intQuarter + diffQuarters; - valueNew = valueNew.replace(checkUndef, intYear+"-Q"+intQuarter); - } - } - } else if (unit.equals("month")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { - if (op.equals("MINUS")) { - diff = diff * (-1); - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(dctYear + "-" + norm.getFromNormNumber(dctMonth+""), diff)); - } else { - String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month", language); - if (lmMonth.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - if (op.equals("MINUS")) { - diff = diff * (-1); - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(lmMonth, diff)); - } - } - } else if (unit.equals("week")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { - if (op.equals("MINUS")) { - diff = diff * (-1); - } else if (op.equals("PLUS")) { - // diff = diff * 7; - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(dctYear+"-W"+norm.getFromNormNumber(dctWeek+""), diff, language)); - } else { - String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); - if (lmDay.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); - } else { - if (op.equals("MINUS")) { - diff = diff * 7 * (-1); - } else if (op.equals("PLUS")) { - diff = diff * 7; - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); - } - } - } else if (unit.equals("day")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) { - if (op.equals("MINUS")) { - diff = diff * (-1); - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-" + dctDay, diff)); - } else { - String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); - if (lmDay.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); - } else { - if (op.equals("MINUS")) { - diff = diff * (-1); - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); - } - } - } - } - } - } - - // century - else if (ambigString.startsWith("UNDEF-last-century")) { - String checkUndef = "UNDEF-last-century"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(dctCentury - 1 +"")); - } else { - String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates,i,"century", language); - if (lmCentury.equals("")) { - valueNew = valueNew.replace(checkUndef, "XX"); - } - else { - lmCentury = DateCalculator.getXNextCentury(lmCentury, -1); - valueNew = valueNew.replace(checkUndef, lmCentury); - } - } - } else if (ambigString.startsWith("UNDEF-this-century")) { - String checkUndef = "UNDEF-this-century"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(dctCentury+"")); - } else { - String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates,i,"century", language); - if (lmCentury.equals("")) { - valueNew = 
valueNew.replace(checkUndef, "XX"); - } else { - valueNew = valueNew.replace(checkUndef, lmCentury); - } - } - } else if (ambigString.startsWith("UNDEF-next-century")) { - String checkUndef = "UNDEF-next-century"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(dctCentury + 1+"")); - } else { - String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates,i,"century", language); - if (lmCentury.equals("")) { - valueNew = valueNew.replace(checkUndef, "XX"); - } else { - lmCentury = DateCalculator.getXNextCentury(lmCentury, +1); - valueNew = valueNew.replace(checkUndef, lmCentury); - } - } - } - - // decade - else if (ambigString.startsWith("UNDEF-last-decade")) { - String checkUndef = "UNDEF-last-decade"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, (dctYear - 10+"").substring(0,3)); - } else { - String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates,i,"decade", language); - if (lmDecade.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX"); - } else { - lmDecade = DateCalculator.getXNextDecade(lmDecade, -1); - valueNew = valueNew.replace(checkUndef, lmDecade); - } - } - } else if (ambigString.startsWith("UNDEF-this-decade")) { - String checkUndef = "UNDEF-this-decade"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, (dctYear+"").substring(0,3)); - } else { - String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates,i,"decade", language); - if (lmDecade.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX"); - } else { - valueNew = valueNew.replace(checkUndef, lmDecade); - } - } - } else if (ambigString.startsWith("UNDEF-next-decade")) { - String checkUndef = "UNDEF-next-decade"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, (dctYear + 10+"").substring(0,3)); - } else { - String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates,i,"decade", language); - if (lmDecade.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX"); - } else { - lmDecade = DateCalculator.getXNextDecade(lmDecade, 1); - valueNew = valueNew.replace(checkUndef, lmDecade); - } - } - } - - // year - else if (ambigString.startsWith("UNDEF-last-year")) { - String checkUndef = "UNDEF-last-year"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, dctYear -1 +""); - } else { - String lmYear = ContextAnalyzer.getLastMentionedX(linearDates,i,"year", language); - if (lmYear.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX"); - } else { - lmYear = DateCalculator.getXNextYear(lmYear, -1); - valueNew = valueNew.replace(checkUndef, lmYear); - } - } - if (valueNew.endsWith("-FY")){ - valueNew = "FY" + valueNew.substring(0, Math.min(valueNew.length(), 4)); - } - } else if (ambigString.startsWith("UNDEF-this-year")) { - String checkUndef = "UNDEF-this-year"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, dctYear +""); - } else { - String lmYear = ContextAnalyzer.getLastMentionedX(linearDates,i,"year", language); - if (lmYear.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX"); - } else { - valueNew = 
valueNew.replace(checkUndef, lmYear); - } - } - if (valueNew.endsWith("-FY")){ - valueNew = "FY" + valueNew.substring(0, Math.min(valueNew.length(), 4)); - } - } else if (ambigString.startsWith("UNDEF-next-year")) { - String checkUndef = "UNDEF-next-year"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, dctYear +1 +""); - } else { - String lmYear = ContextAnalyzer.getLastMentionedX(linearDates,i,"year", language); - if (lmYear.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX"); - } else { - lmYear = DateCalculator.getXNextYear(lmYear, 1); - valueNew = valueNew.replace(checkUndef, lmYear); - } - } - if (valueNew.endsWith("-FY")){ - valueNew = "FY" + valueNew.substring(0, Math.min(valueNew.length(), 4)); - } - } - - // month - else if (ambigString.startsWith("UNDEF-last-month")) { - String checkUndef = "UNDEF-last-month"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(dctYear + "-" + norm.getFromNormNumber(dctMonth+""), -1)); - } else { - String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates,i,"month", language); - if (lmMonth.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(lmMonth, -1)); - } - } - } else if (ambigString.startsWith("UNDEF-this-month")) { - String checkUndef = "UNDEF-this-month"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, dctYear + "-" + norm.getFromNormNumber(dctMonth+"")); - } else { - String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates,i,"month", language); - if (lmMonth.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - valueNew = valueNew.replace(checkUndef, lmMonth); - } - } - } - else if (ambigString.startsWith("UNDEF-next-month")) { - String checkUndef = "UNDEF-next-month"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(dctYear + "-" + norm.getFromNormNumber(dctMonth+""), 1)); - } else { - String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates,i,"month", language); - if (lmMonth.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(lmMonth, 1)); - } - } - } - - // day - else if (ambigString.startsWith("UNDEF-last-day")) { - String checkUndef = "UNDEF-last-day"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-"+ dctDay, -1)); - } else { - String lmDay = ContextAnalyzer.getLastMentionedX(linearDates,i,"day", language); - if (lmDay.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); - } else { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay,-1)); - } - } - } else if (ambigString.startsWith("UNDEF-this-day")) { - String checkUndef = "UNDEF-this-day"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-"+ norm.getFromNormNumber(dctDay+"")); - } else 
{ - String lmDay = ContextAnalyzer.getLastMentionedX(linearDates,i,"day", language); - if (lmDay.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); - } else { - valueNew = valueNew.replace(checkUndef, lmDay); - } - if (ambigString.equals("UNDEF-this-day")) { - valueNew = "PRESENT_REF"; - } - } - } - else if (ambigString.startsWith("UNDEF-next-day")) { - String checkUndef = "UNDEF-next-day"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-"+ dctDay, 1)); - } else { - String lmDay = ContextAnalyzer.getLastMentionedX(linearDates,i,"day", language); - if (lmDay.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); - } else { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay,1)); - } - } - } - - // week - else if (ambigString.startsWith("UNDEF-last-week")) { - String checkUndef = "UNDEF-last-week"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(dctYear+"-W"+norm.getFromNormNumber(dctWeek+""),-1, language)); - } else { - String lmWeek = ContextAnalyzer.getLastMentionedX(linearDates,i,"week", language); - if (lmWeek.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-WXX"); - } else { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(lmWeek,-1, language)); - } - } - } else if (ambigString.startsWith("UNDEF-this-week")) { - String checkUndef = "UNDEF-this-week"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef,dctYear+"-W"+norm.getFromNormNumber(dctWeek+"")); - } else { - String lmWeek = ContextAnalyzer.getLastMentionedX(linearDates,i,"week", language); - if (lmWeek.equals("")) { - valueNew = valueNew.replace(checkUndef,"XXXX-WXX"); - } else { - valueNew = valueNew.replace(checkUndef,lmWeek); - } - } - } else if (ambigString.startsWith("UNDEF-next-week")) { - String checkUndef = "UNDEF-next-week"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(dctYear+"-W"+norm.getFromNormNumber(dctWeek+""),1, language)); - } else { - String lmWeek = ContextAnalyzer.getLastMentionedX(linearDates,i,"week", language); - if (lmWeek.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-WXX"); - } else { - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(lmWeek,1, language)); - } - } - } - - // quarter - else if (ambigString.startsWith("UNDEF-last-quarter")) { - String checkUndef = "UNDEF-last-quarter"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - if (dctQuarter.equals("Q1")) { - valueNew = valueNew.replace(checkUndef, dctYear-1+"-Q4"); - } else { - int newQuarter = Integer.parseInt(dctQuarter.substring(1,2))-1; - valueNew = valueNew.replace(checkUndef, dctYear+"-Q"+newQuarter); - } - } else { - String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language); - if (lmQuarter.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-QX"); - } else { - int lmQuarterOnly = Integer.parseInt(lmQuarter.substring(6,7)); - int lmYearOnly = Integer.parseInt(lmQuarter.substring(0,4)); - if (lmQuarterOnly == 1) { - valueNew = 
valueNew.replace(checkUndef, lmYearOnly-1+"-Q4"); - } else { - int newQuarter = lmQuarterOnly-1; - valueNew = valueNew.replace(checkUndef, lmYearOnly+"-Q"+newQuarter); - } - } - } - } else if (ambigString.startsWith("UNDEF-this-quarter")) { - String checkUndef = "UNDEF-this-quarter"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, dctYear+"-"+dctQuarter); - } else { - String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language); - if (lmQuarter.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-QX"); - } else { - valueNew = valueNew.replace(checkUndef, lmQuarter); - } - } - } else if (ambigString.startsWith("UNDEF-next-quarter")) { - String checkUndef = "UNDEF-next-quarter"; - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - if (dctQuarter.equals("Q4")) { - valueNew = valueNew.replace(checkUndef, dctYear+1+"-Q1"); - } else { - int newQuarter = Integer.parseInt(dctQuarter.substring(1,2))+1; - valueNew = valueNew.replace(checkUndef, dctYear+"-Q"+newQuarter); - } - } else { - String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language); - if (lmQuarter.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-QX"); - } else { - int lmQuarterOnly = Integer.parseInt(lmQuarter.substring(6,7)); - int lmYearOnly = Integer.parseInt(lmQuarter.substring(0,4)); - if (lmQuarterOnly == 4) { - valueNew = valueNew.replace(checkUndef, lmYearOnly+1+"-Q1"); - } else { - int newQuarter = lmQuarterOnly+1; - valueNew = valueNew.replace(checkUndef, lmYearOnly+"-Q"+newQuarter); - } - } - } - } - - // MONTH NAMES - else if (ambigString.matches("UNDEF-(last|this|next)-(january|february|march|april|may|june|july|august|september|october|november|december).*")) { - for (MatchResult mr : Toolbox.findMatches(Pattern.compile("(UNDEF-(last|this|next)-(january|february|march|april|may|june|july|august|september|october|november|december))(.*)"),ambigString)) { - String rest = mr.group(4); - int day = 0; - for (MatchResult mr_rest : Toolbox.findMatches(Pattern.compile("-([0-9][0-9])"),rest)){ - day = Integer.parseInt(mr_rest.group(1)); - } - String checkUndef = mr.group(1); - String ltn = mr.group(2); - String newMonth = norm.getFromNormMonthName((mr.group(3))); - int newMonthInt = Integer.parseInt(newMonth); - if (ltn.equals("last")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - // check day if dct-month and newMonth are equal - if ((dctMonth == newMonthInt) && (!(day == 0))){ - if (dctDay > day){ - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth); - } - else{ - valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newMonth); - } - } - else if (dctMonth <= newMonthInt) { - valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newMonth); - } else { - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth); - } - } else { - String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month-with-details", language); - if (lmMonth.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - int lmMonthInt = Integer.parseInt(lmMonth.substring(5,7)); - // - int lmDayInt = 0; - if ((lmMonth.length() > 9) && (lmMonth.subSequence(8,10).toString().matches("\\d\\d"))){ - lmDayInt = Integer.parseInt(lmMonth.subSequence(8,10)+""); - } - if ((lmMonthInt == newMonthInt) && (!(lmDayInt == 0)) && (!(day == 0))){ - if (lmDayInt > day){ - 
valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth); - } - else{ - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmMonth.substring(0,4))-1+"-"+newMonth); - } - } - if (lmMonthInt <= newMonthInt) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmMonth.substring(0,4))-1+"-"+newMonth); - } else { - valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth); - } - } - } - } else if (ltn.equals("this")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth); - } else { - String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month-with-details", language); - if (lmMonth.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth); - } - } - } else if (ltn.equals("next")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - // check day if dct-month and newMonth are equal - if ((dctMonth == newMonthInt) && (!(day == 0))){ - if (dctDay < day){ - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth); - } - else{ - valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newMonth); - } - } - else if (dctMonth >= newMonthInt) { - valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newMonth); - } else { - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth); - } - } else { - String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month-with-details", language); - if (lmMonth.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - int lmMonthInt = Integer.parseInt(lmMonth.substring(5,7)); - if (lmMonthInt >= newMonthInt) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmMonth.substring(0,4))+1+"-"+newMonth); - } else { - valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth); - } - } - } - } - } - } - - // SEASONS NAMES - else if (ambigString.matches("^UNDEF-(last|this|next)-(SP|SU|FA|WI).*")) { - for (MatchResult mr : Toolbox.findMatches(Pattern.compile("(UNDEF-(last|this|next)-(SP|SU|FA|WI)).*"),ambigString)) { - String checkUndef = mr.group(1); - String ltn = mr.group(2); - String newSeason = mr.group(3); - if (ltn.equals("last")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - if (dctSeason.equals("SP")) { - valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); - } else if (dctSeason.equals("SU")) { - if (newSeason.equals("SP")) { - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); - } - } else if (dctSeason.equals("FA")) { - if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) { - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); - } - } else if (dctSeason.equals("WI")) { - if (newSeason.equals("WI")) { - valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); - } else { - if (dctMonth < 12){ - valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason); - } - else{ - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); - } - } - } - } else { // NARRATVIE DOCUMENT - String lmSeason = ContextAnalyzer.getLastMentionedX(linearDates, i, "season", language); - if (lmSeason.equals("")) { - valueNew = 
valueNew.replace(checkUndef, "XXXX-XX"); - } else { - if (lmSeason.substring(5,7).equals("SP")) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason); - } else if (lmSeason.substring(5,7).equals("SU")) { - if (lmSeason.substring(5,7).equals("SP")) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason); - } - } else if (lmSeason.substring(5,7).equals("FA")) { - if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason); - } - } else if (lmSeason.substring(5,7).equals("WI")) { - if (newSeason.equals("WI")) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); - } - } - } - } - } else if (ltn.equals("this")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - // TODO include tense of sentence? - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); - } else { - // TODO include tense of sentence? - String lmSeason = ContextAnalyzer.getLastMentionedX(linearDates, i, "season", language); - if (lmSeason.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - valueNew = valueNew.replace(checkUndef, lmSeason.substring(0,4)+"-"+newSeason); - } - } - } else if (ltn.equals("next")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - if (dctSeason.equals("SP")) { - if (newSeason.equals("SP")) { - valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); - } - } else if (dctSeason.equals("SU")) { - if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) { - valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); - } - } else if (dctSeason.equals("FA")) { - if (newSeason.equals("WI")) { - valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason); - } - } else if (dctSeason.equals("WI")) { - valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason); - } - } else { // NARRATIVE DOCUMENT - String lmSeason = ContextAnalyzer.getLastMentionedX(linearDates, i, "season", language); - if (lmSeason.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX"); - } else { - if (lmSeason.substring(5,7).equals("SP")) { - if (newSeason.equals("SP")) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); - } - } else if (lmSeason.substring(5,7).equals("SU")) { - if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); - } - } else if (lmSeason.substring(5,7).equals("FA")) { - if 
(newSeason.equals("WI")) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason); - } else { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason); - } - } else if (lmSeason.substring(5,7).equals("WI")) { - valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason); - } - } - } - } - } - } - - // WEEKDAY NAMES - // TODO the calculation is strange, but works - // TODO tense should be included?! - else if (ambigString.matches("^UNDEF-(last|this|next|day)-(monday|tuesday|wednesday|thursday|friday|saturday|sunday).*")) { - for (MatchResult mr : Toolbox.findMatches(Pattern.compile("(UNDEF-(last|this|next|day)-(monday|tuesday|wednesday|thursday|friday|saturday|sunday)).*"),ambigString)) { - String checkUndef = mr.group(1); - String ltnd = mr.group(2); - String newWeekday = mr.group(3); - int newWeekdayInt = Integer.parseInt(norm.getFromNormDayInWeek(newWeekday)); - if (ltnd.equals("last")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - int diff = (-1) * (dctWeekday - newWeekdayInt); - if (diff >= 0) { - diff = diff - 7; - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-" + dctDay, diff)); - } else { - String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); - if (lmDay.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); - } else { - int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay); - int diff = (-1) * (lmWeekdayInt - newWeekdayInt); - if (diff >= 0) { - diff = diff - 7; - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); - } - } - } else if (ltnd.equals("this")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - // TODO tense should be included?! - int diff = (-1) * (dctWeekday - newWeekdayInt); - if (diff >= 0) { - diff = diff - 7; - } - if (diff == -7) { - diff = 0; - } - - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-"+ dctDay, diff)); - } else { - // TODO tense should be included?! 
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); - if (lmDay.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); - } else { - int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay); - int diff = (-1) * (lmWeekdayInt - newWeekdayInt); - if (diff >= 0) { - diff = diff - 7; - } - if (diff == -7) { - diff = 0; - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); - } - } - } else if (ltnd.equals("next")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - int diff = newWeekdayInt - dctWeekday; - if (diff <= 0) { - diff = diff + 7; - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-"+ dctDay, diff)); - } else { - String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); - if (lmDay.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); - } else { - int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay); - int diff = newWeekdayInt - lmWeekdayInt; - if (diff <= 0) { - diff = diff + 7; - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); - } - } - } else if (ltnd.equals("day")) { - if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) { - // TODO tense should be included?! - int diff = (-1) * (dctWeekday - newWeekdayInt); - if (diff >= 0) { - diff = diff - 7; - } - if (diff == -7) { - diff = 0; - } - // Tense is FUTURE - if ((last_used_tense.equals("FUTURE")) && diff != 0) { - diff = diff + 7; - } - // Tense is PAST - if ((last_used_tense.equals("PAST"))) { - - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-"+ dctDay, diff)); - } else { - // TODO tense should be included?! - String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language); - if (lmDay.equals("")) { - valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX"); - } else { - int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay); - int diff = (-1) * (lmWeekdayInt - newWeekdayInt); - if (diff >= 0) { - diff = diff - 7; - } - if (diff == -7) { - diff = 0; - } - valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff)); - } - } - } - } - - } else { - Logger.printDetail(component, "ATTENTION: UNDEF value for: " + valueNew+" is not handled in disambiguation phase!"); - } - } - - return valueNew; - } - - /** - * Under-specified values are disambiguated here. Only Timexes of types "date" and "time" can be under-specified. 
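The placeholder convention resolved here: extraction rules first emit values such as "UNDEF-last-january" or "UNDEF-this-year", and this phase rewrites them against the document creation time (DCT) for news-style documents, or against the last mentioned date otherwise. A minimal sketch of the month case (hypothetical helper; day-of-month handling omitted):

// Resolve "UNDEF-last-<month>" against a DCT year/month, mirroring the
// month-name branch above: if the DCT month is at or before the target
// month, "last <month>" falls into the previous year.
static String resolveLastMonth(int dctYear, int dctMonth, int targetMonth) {
    int year = (dctMonth <= targetMonth) ? dctYear - 1 : dctYear;
    return String.format("%04d-%02d", year, targetMonth);
}
// resolveLastMonth(2004, 12, 1) -> "2004-01"  ("last January" seen in Dec 2004)
// resolveLastMonth(2004, 1, 1)  -> "2003-01"  ("last January" seen in Jan 2004)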
- * @param jcas - */ - public void specifyAmbiguousValues(JCas jcas) { - // build up a list with all found TIMEX expressions - List linearDates = new ArrayList(); - FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator(); - - // Create List of all Timexes of types "date" and "time" - while (iterTimex.hasNext()) { - Timex3 timex = (Timex3) iterTimex.next(); - if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME")) { - linearDates.add(timex); - } - - if(timex.getTimexType().equals("DURATION") && !timex.getEmptyValue().equals("")) { - linearDates.add(timex); - } - } - - ////////////////////////////////////////////// - // go through list of Date and Time timexes // - ////////////////////////////////////////////// - for (int i = 0; i < linearDates.size(); i++) { - Timex3 t_i = (Timex3) linearDates.get(i); - String value_i = t_i.getTimexValue(); - - String valueNew = value_i; - // handle the value attribute only if we have a TIME or DATE - if(t_i.getTimexType().equals("TIME") || t_i.getTimexType().equals("DATE")) - valueNew = specifyAmbiguousValuesString(value_i, t_i, i, linearDates, jcas); - - // handle the emptyValue attribute for any type - if(t_i.getEmptyValue() != null && t_i.getEmptyValue().length() > 0) { - String emptyValueNew = specifyAmbiguousValuesString(t_i.getEmptyValue(), t_i, i, linearDates, jcas); - t_i.setEmptyValue(emptyValueNew); - } - - t_i.removeFromIndexes(); - Logger.printDetail(t_i.getTimexId()+" DISAMBIGUATION PHASE: foundBy:"+t_i.getFoundByRule()+" text:"+t_i.getCoveredText()+" value:"+t_i.getTimexValue()+" NEW value:"+valueNew); - - t_i.setTimexValue(valueNew); - t_i.addToIndexes(); - linearDates.set(i, t_i); - } - } - - - /** - * @param jcas - */ - private void deleteOverlappedTimexesPreprocessing(JCas jcas) { - FSIterator timexIter1 = jcas.getAnnotationIndex(Timex3.type).iterator(); - HashSet hsTimexesToRemove = new HashSet(); - while (timexIter1.hasNext()) { - Timex3 t1 = (Timex3) timexIter1.next(); - FSIterator timexIter2 = jcas.getAnnotationIndex(Timex3.type).iterator(); - - while (timexIter2.hasNext()) { - Timex3 t2 = (Timex3) timexIter2.next(); - if (((t1.getBegin() >= t2.getBegin()) && (t1.getEnd() < t2.getEnd())) || // t1 starts inside or with t2 and ends before t2 -> remove t1 - ((t1.getBegin() > t2.getBegin()) && (t1.getEnd() <= t2.getEnd()))) { // t1 starts inside t2 and ends with or before t2 -> remove t1 - hsTimexesToRemove.add(t1); - } - else if (((t2.getBegin() >= t1.getBegin()) && (t2.getEnd() < t1.getEnd())) || // t2 starts inside or with t1 and ends before t1 -> remove t2 - ((t2.getBegin() > t1.getBegin()) && (t2.getEnd() <= t1.getEnd()))) { // t2 starts inside t1 and ends with or before t1 -> remove t2 - hsTimexesToRemove.add(t2); - } - // identical length - if (!t1.equals(t2) && (t1.getBegin() == t2.getBegin()) && (t1.getEnd() == t2.getEnd())) { - if ((t1.getTimexValue().startsWith("UNDEF")) && (!(t2.getTimexValue().startsWith("UNDEF")))) { - hsTimexesToRemove.add(t1); - } - else if ((!(t1.getTimexValue().startsWith("UNDEF"))) && (t2.getTimexValue().startsWith("UNDEF"))) { - hsTimexesToRemove.add(t2); - } - // t1 is explicit, but t2 is not - else if ((t1.getFoundByRule().endsWith("explicit")) && (!(t2.getFoundByRule().endsWith("explicit")))) { - hsTimexesToRemove.add(t2); - } - // remove timexes that are identical, but one has an emptyvalue - else if(t2.getEmptyValue().equals("") && !t1.getEmptyValue().equals("")) { - hsTimexesToRemove.add(t2); - } - // REMOVE REAL DUPLICATES (the one with the lower timexID) - 
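The pairwise containment test driving this cleanup, extracted as a minimal sketch (hypothetical helper over begin/end offsets, not the UIMA API):

// t1 is dropped when it starts with or inside t2 and ends strictly before t2,
// or starts strictly inside t2 and ends with or before t2.
static boolean strictlyContained(int b1, int e1, int b2, int e2) {
    return (b1 >= b2 && e1 < e2) || (b1 > b2 && e1 <= e2);
}
// strictlyContained(5, 8, 5, 10) == true    // same start, shorter extent
// strictlyContained(5, 10, 5, 10) == false  // identical spans are tie-broken below
// (UNDEF values lose to resolved ones, non-explicit rules lose to explicit ones,
// and as a last resort the timex with the lower id is removed.)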
else if ((Integer.parseInt(t1.getTimexId().substring(1)) < Integer.parseInt(t2.getTimexId().substring(1)))) { - hsTimexesToRemove.add(t1); - } - } - } - } - // remove, finally - for (Timex3 t : hsTimexesToRemove) { - Logger.printDetail("REMOVE DUPLICATE: " + t.getCoveredText()+"(id:"+t.getTimexId()+" value:"+t.getTimexValue()+" found by:"+t.getFoundByRule()+")"); - - t.removeFromIndexes(); - timex_counter--; - } - } - - private void deleteOverlappedTimexesPostprocessing(JCas jcas) { - FSIterator timexIter = jcas.getAnnotationIndex(Timex3.type).iterator(); - FSIterator innerTimexIter = timexIter.copy(); - HashSet> effectivelyToInspect = new HashSet>(); - ArrayList allTimexesToInspect = new ArrayList(); - while(timexIter.hasNext()) { - Timex3 myTimex = (Timex3) timexIter.next(); - - ArrayList timexSet = new ArrayList(); - if (!(myTimex.getTimexType().equals("TEMPONYM"))) { - timexSet.add(myTimex); - } - - // compare this timex to all other timexes and mark those that have an overlap - while(innerTimexIter.hasNext()) { - Timex3 myInnerTimex = (Timex3) innerTimexIter.next(); - if (!(myTimex.getTimexType().equals("TEMPONYM"))) { - if((myTimex.getBegin() <= myInnerTimex.getBegin() && myTimex.getEnd() > myInnerTimex.getBegin()) || // timex1 starts, timex2 is partial overlap - (myInnerTimex.getBegin() <= myTimex.getBegin() && myInnerTimex.getEnd() > myTimex.getBegin()) || // same as above, but in reverse - (myInnerTimex.getBegin() <= myTimex.getBegin() && myTimex.getEnd() <= myInnerTimex.getEnd()) || // timex 1 is contained within or identical to timex2 - (myTimex.getBegin() <= myInnerTimex.getBegin() && myInnerTimex.getEnd() <= myTimex.getEnd())) { // same as above, but in reverse - timexSet.add(myInnerTimex); // increase the set - - allTimexesToInspect.add(myTimex); // note that these timexes are being looked at - allTimexesToInspect.add(myInnerTimex); - } - } - } - - // if overlaps with myTimex were detected, memorize them - if(timexSet.size() > 1) - effectivelyToInspect.add(timexSet); - - // reset the inner iterator - innerTimexIter.moveToFirst(); - } - - /* prune those sets of overlapping timexes that are subsets of others - * (i.e. leave only the largest union of overlapping timexes) - */ - HashSet> newEffectivelyToInspect = new HashSet>(); - for(Timex3 t : allTimexesToInspect) { - ArrayList setToKeep = new ArrayList(); - - // determine the largest set that contains this timex - for(ArrayList tSet : effectivelyToInspect) { - if(tSet.contains(t) && tSet.size() > setToKeep.size()) - setToKeep = tSet; - } - - newEffectivelyToInspect.add(setToKeep); - } - // overwrite previous list of sets - effectivelyToInspect = newEffectivelyToInspect; - - // iterate over the selected sets and merge information, remove old timexes - for(ArrayList tSet : effectivelyToInspect) { - Timex3 newTimex = new Timex3(jcas); - - // if a timex has the timex value REMOVE, remove it from consideration - @SuppressWarnings("unchecked") - ArrayList newTSet = (ArrayList) tSet.clone(); - for(Timex3 t : tSet) { - if(t.getTimexValue().equals("REMOVE")) { // remove timexes with value "REMOVE" - newTSet.remove(t); - } - } - tSet = newTSet; - - // iteration is done if all the timexes have been removed, i.e. 
the set is empty - if(tSet.size() == 0) - continue; - - /* - * check - * - whether all timexes of this set have the same timex type attribute, - * - which one in the set has the longest value attribute string length, - * - what the combined extents are - */ - Boolean allSameTypes = true; - String timexType = null; - Timex3 longestTimex = null; - Integer combinedBegin = Integer.MAX_VALUE, combinedEnd = Integer.MIN_VALUE; - ArrayList tokenIds = new ArrayList(); - for(Timex3 t : tSet) { - // check whether the types are identical and either all DATE or TIME - if(timexType == null) { - timexType = t.getTimexType(); - } else { - if(allSameTypes && !timexType.equals(t.getTimexType()) || !(timexType.equals("DATE") || timexType.equals("TIME"))) { - allSameTypes = false; - } - } - Logger.printDetail("Are these overlapping timexes of same type? => " + allSameTypes); - - // check timex value attribute string length - if(longestTimex == null) { - longestTimex = t; - } else if(allSameTypes && t.getFoundByRule().indexOf("-BCADhint") != -1) { - longestTimex = t; - } else if(allSameTypes && t.getFoundByRule().indexOf("relative") == -1 && longestTimex.getFoundByRule().indexOf("relative") != -1) { - longestTimex = t; - } else if(longestTimex.getTimexValue().length() == t.getTimexValue().length()) { - if(t.getBegin() < longestTimex.getBegin()) - longestTimex = t; - } else if(longestTimex.getTimexValue().length() < t.getTimexValue().length()) { - longestTimex = t; - } - Logger.printDetail("Selected " + longestTimex.getTimexId() + ": " + longestTimex.getCoveredText() + - "[" + longestTimex.getTimexValue() + "] as the longest-valued timex."); - - // check combined beginning/end - if(combinedBegin > t.getBegin()) - combinedBegin = t.getBegin(); - if(combinedEnd < t.getEnd()) - combinedEnd = t.getEnd(); - Logger.printDetail("Selected combined constraints: " + combinedBegin + ":" + combinedEnd); - - // disassemble and remember the token ids - String[] tokenizedTokenIds = t.getAllTokIds().split("<-->"); - for(Integer i = 1; i < tokenizedTokenIds.length; i++) { - if(!tokenIds.contains(Integer.parseInt(tokenizedTokenIds[i]))) { - tokenIds.add(Integer.parseInt(tokenizedTokenIds[i])); - } - } - } - - /* types are equal => merge constraints, use the longer, "more granular" value. - * if types are not equal, just take the longest value. - */ - Collections.sort(tokenIds); - newTimex = longestTimex; - if(allSameTypes) { - newTimex.setBegin(combinedBegin); - newTimex.setEnd(combinedEnd); - if(tokenIds.size() > 0) - newTimex.setFirstTokId(tokenIds.get(0)); - String tokenIdText = "BEGIN"; - for(Integer tokenId : tokenIds) { - tokenIdText += "<-->" + tokenId; - } - newTimex.setAllTokIds(tokenIdText); - } - - // remove old overlaps. - for(Timex3 t : tSet) { - t.removeFromIndexes(); - } - // add the single constructed/chosen timex to the indexes. - newTimex.addToIndexes(); - } - } - - /** * Identify the part of speech (POS) of a MatchResult.
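The refactored lookup below builds an offset-to-token map once per sentence; a minimal sketch of the idea (simplified types, not the UIMA API):

import java.util.HashMap;
import java.util.Map;

class PosLookupSketch {
    static class Token {
        final int begin; final String pos;
        Token(int begin, String pos) { this.begin = begin; this.pos = pos; }
    }

    // Index tokens by begin offset, then resolve a match's POS in O(1),
    // the same idea as the hmTokens map in the method below.
    static String posAt(Iterable<Token> sentenceTokens, int matchBegin) {
        Map<Integer, Token> byBegin = new HashMap<>();
        for (Token t : sentenceTokens)
            byBegin.put(t.begin, t);
        Token t = byBegin.get(matchBegin);
        return t != null ? t.pos : "";   // empty when no token starts at that offset
    }
}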
+ * * @param tokBegin * @param tokEnd * @param s @@ -2129,9 +711,9 @@ private void deleteOverlappedTimexesPostprocessing(JCas jcas) { public String getPosFromMatchResult(int tokBegin, int tokEnd, Sentence s, JCas jcas) { // get all tokens in sentence HashMap hmTokens = new HashMap(); - FSIterator iterTok = jcas.getAnnotationIndex(Token.type).subiterator(s); - while (iterTok.hasNext()) { - Token token = (Token) iterTok.next(); + AnnotationIndex tokens = jcas.getAnnotationIndex(Token.type); + for (FSIterator iterTok = tokens.subiterator(s); iterTok.hasNext();) { + Token token = iterTok.next(); hmTokens.put(token.getBegin(), token); } // get correct token @@ -2143,424 +725,153 @@ public String getPosFromMatchResult(int tokBegin, int tokEnd, Sentence s, JCas j return pos; } - + // pattern for offset information + Pattern paOffset = Pattern.compile("group\\(([0-9]+)\\)-group\\(([0-9]+)\\)"); + /** * Apply the extraction rules, normalization rules + * * @param timexType - * @param hmPattern - * @param hmOffset - * @param hmNormalization - * @param s + * Type to find + * @param sortedRules + * sorted rules + * @param startpos + * Valid starting positions + * @param endpos + * Valid end positions + * @param s + * Sentence * @param jcas + * JCas + * @param coveredText + * covered text */ - public void findTimexes(String timexType, - HashMap hmPattern, - HashMap hmOffset, - HashMap hmNormalization, - Sentence s, - JCas jcas) { - RuleManager rm = RuleManager.getInstance(language, find_temponyms); - HashMap hmDatePosConstraint = rm.getHmDatePosConstraint(); - HashMap hmDurationPosConstraint = rm.getHmDurationPosConstraint(); - HashMap hmTimePosConstraint = rm.getHmTimePosConstraint(); - HashMap hmSetPosConstraint = rm.getHmSetPosConstraint(); - HashMap hmTemponymPosConstraint = rm.getHmTemponymPosConstraint(); - - // get fast check patterns first - HashMap hmDateFastCheck = rm.getHmDateFastCheck(); - HashMap hmDurationFastCheck = rm.getHmDurationFastCheck(); - HashMap hmTimeFastCheck = rm.getHmTimeFastCheck(); - HashMap hmSetFastCheck = rm.getHmSetFastCheck(); - HashMap hmTemponymFastCheck = rm.getHmTemponymFastCheck(); - Pattern f = null; - Boolean fastCheckOK = true; - + public void findTimexes(String timexType, List sortedRules, TokenBoundaryMatcher matcher, Sentence s, JCas jcas, CharSequence coveredText) { // Iterator over the rules by sorted by the name of the rules - // this is important since later, the timexId will be used to + // this is important since later, the timexId will be used to // decide which of two expressions shall be removed if both // have the same offset - for (Iterator i = Toolbox.sortByValue(hmPattern).iterator(); i.hasNext(); ) { - Pattern p = (Pattern) i.next(); - - // validate fast check fist, if no fast match, everything else is not required anymore - if (timexType.equals("DATE")) { - f = hmDateFastCheck.get(hmPattern.get(p)); - } else if (timexType.equals("Time")) { - f = hmTimeFastCheck.get(hmPattern.get(p)); - } else if (timexType.equals("DURATION")) { - f = hmDurationFastCheck.get(hmPattern.get(p)); - } else if (timexType.equals("SET")) { - f = hmSetFastCheck.get(hmPattern.get(p)); - } else if (timexType.equals("TEMPONYM")) { - f = hmTemponymFastCheck.get(hmPattern.get(p)); - } - if (!(f == null)){ - fastCheckOK = false; - - if (f.matcher(s.getCoveredText()).find()) { - fastCheckOK = true; - } - } - - - if (fastCheckOK) { - for (MatchResult r : Toolbox.findMatches(p, s.getCoveredText())) { - boolean infrontBehindOK = ContextAnalyzer.checkTokenBoundaries(r, s, jcas) 
// improved token boundary checking - && ContextAnalyzer.checkInfrontBehind(r, s); - - - // CHECK POS CONSTRAINTS - boolean posConstraintOK = true; - - if (timexType.equals("DATE")) { - if (hmDatePosConstraint.containsKey(hmPattern.get(p))) { - posConstraintOK = checkPosConstraint(s , hmDatePosConstraint.get(hmPattern.get(p)), r, jcas); - } - } else if (timexType.equals("DURATION")) { - if (hmDurationPosConstraint.containsKey(hmPattern.get(p))) { - posConstraintOK = checkPosConstraint(s , hmDurationPosConstraint.get(hmPattern.get(p)), r, jcas); - } - } else if (timexType.equals("TIME")) { - if (hmTimePosConstraint.containsKey(hmPattern.get(p))) { - posConstraintOK = checkPosConstraint(s , hmTimePosConstraint.get(hmPattern.get(p)), r, jcas); - } - } else if (timexType.equals("SET")) { - if (hmSetPosConstraint.containsKey(hmPattern.get(p))) { - posConstraintOK = checkPosConstraint(s , hmSetPosConstraint.get(hmPattern.get(p)), r, jcas); - } - } else if (timexType.equals("TEMPONYM")) { - if (hmTemponymPosConstraint.containsKey(hmPattern.get(p))) { - posConstraintOK = checkPosConstraint(s , hmSetPosConstraint.get(hmPattern.get(p)), r, jcas); - } - } - - if ((infrontBehindOK == true) && (posConstraintOK == true)) { - - // Offset of timex expression (in the checked sentence) - int timexStart = r.start(); - int timexEnd = r.end(); - - // Normalization from Files: - - // Any offset parameter? - if (hmOffset.containsKey(hmPattern.get(p))) { - String offset = hmOffset.get(hmPattern.get(p)); - - // pattern for offset information - Pattern paOffset = Pattern.compile("group\\(([0-9]+)\\)-group\\(([0-9]+)\\)"); - for (MatchResult mr : Toolbox.findMatches(paOffset,offset)) { - int startOffset = Integer.parseInt(mr.group(1)); - int endOffset = Integer.parseInt(mr.group(2)); - timexStart = r.start(startOffset); - timexEnd = r.end(endOffset); - } - } - - // Normalization Parameter - if (hmNormalization.containsKey(hmPattern.get(p))) { - String[] attributes = new String[5]; - if (timexType.equals("DATE")) { - attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmDateNormalization(), rm.getHmDateQuant(), rm.getHmDateFreq(), rm.getHmDateMod(), rm.getHmDateEmptyValue(), r, jcas); - } else if (timexType.equals("DURATION")) { - attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmDurationNormalization(), rm.getHmDurationQuant(), rm.getHmDurationFreq(), rm.getHmDurationMod(), rm.getHmDurationEmptyValue(), r, jcas); - } else if (timexType.equals("TIME")) { - attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmTimeNormalization(), rm.getHmTimeQuant(), rm.getHmTimeFreq(), rm.getHmTimeMod(), rm.getHmTimeEmptyValue(), r, jcas); - } else if (timexType.equals("SET")) { - attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmSetNormalization(), rm.getHmSetQuant(), rm.getHmSetFreq(), rm.getHmSetMod(), rm.getHmSetEmptyValue(), r, jcas); - } else if (timexType.equals("TEMPONYM")) { - attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmTemponymNormalization(), rm.getHmTemponymQuant(), rm.getHmTemponymFreq(), rm.getHmTemponymMod(), rm.getHmTemponymEmptyValue(), r, jcas); - } - if (!(attributes == null)) { - addTimexAnnotation(timexType, timexStart + s.getBegin(), timexEnd + s.getBegin(), s, - attributes[0], attributes[1], attributes[2], attributes[3], attributes[4], "t" + timexID++, hmPattern.get(p), jcas); - } - } - else { - Logger.printError("SOMETHING REALLY WRONG HERE: "+hmPattern.get(p)); - } + for (Rule rule : sortedRules) { + String key = 
rule.getName(); + // validate fast check first, if no fast match, everything else is + // not required anymore + Pattern f = rule.getFastCheck(); + if (f != null && matcher.matchNext(0, f.matcher(coveredText), key) < 0) + continue; + + Matcher m = rule.getPattern().matcher(coveredText); + for (int tpos = 0; (tpos = matcher.matchNext(tpos, m, key)) >= 0;) { + // CHECK POS CONSTRAINTS + String constraint = rule.getPosConstratint(); + if (constraint != null && !checkPosConstraint(key, s, constraint, m, jcas)) + continue; + // Offset of timex expression (in the checked sentence) + int timexStart = m.start(), timexEnd = m.end(); + + // Any offset parameter? + String offset = rule.getOffset(); + if (offset != null) { + Matcher mr = paOffset.matcher(offset); + if (mr.matches()) { + timexStart = m.start(parseInt(mr.group(1))); + timexEnd = m.end(parseInt(mr.group(2))); + } else { + LOG.warn("Offset pattern does not match: {}", offset); } } + + // Normalization Parameter + if (rule.getNormalization() == null) { + LOG.warn("No normalization pattern for: {}", key); + continue; + } + String[] attributes = getAttributesForTimexFromFile(key, rule, m, jcas); + if (attributes != null) { + addTimexAnnotation(timexType, timexStart + s.getBegin(), timexEnd + s.getBegin(), s, attributes[0], attributes[1], attributes[2], attributes[3], attributes[4], + "t" + timexID++, key, jcas); + } } - fastCheckOK = true; } } - - + + static Pattern paConstraint = Pattern.compile("group\\(([0-9]+)\\):(.*?):"); + /** * Check whether the part of speech constraint defined in a rule is satisfied. + * + * @param rule + * Rule name, for error reporting * @param s * @param posConstraint * @param m * @param jcas * @return */ - public boolean checkPosConstraint(Sentence s, String posConstraint, MatchResult m, JCas jcas) { - Pattern paConstraint = Pattern.compile("group\\(([0-9]+)\\):(.*?):"); - for (MatchResult mr : Toolbox.findMatches(paConstraint,posConstraint)) { - int groupNumber = Integer.parseInt(mr.group(1)); - int tokenBegin = s.getBegin() + m.start(groupNumber); - int tokenEnd = s.getBegin() + m.end(groupNumber); - String pos = mr.group(2); - String pos_as_is = getPosFromMatchResult(tokenBegin, tokenEnd ,s, jcas); - if (pos_as_is.matches(pos)) { - Logger.printDetail("POS CONSTRAINT IS VALID: pos should be "+pos+" and is "+pos_as_is); - } else { - return false; + public boolean checkPosConstraint(String rule, Sentence s, String posConstraint, MatchResult m, JCas jcas) { + Matcher mr = paConstraint.matcher(posConstraint); + while (mr.find()) { + try { + int groupNumber = parseInt(mr.group(1)); + int tokenBegin = s.getBegin() + m.start(groupNumber); + int tokenEnd = s.getBegin() + m.end(groupNumber); + String pos = mr.group(2); + String pos_as_is = getPosFromMatchResult(tokenBegin, tokenEnd, s, jcas); + if (!pos_as_is.matches(pos)) + return false; + if (LOG.isTraceEnabled()) + LOG.trace("POS CONSTRAINT IS VALID: pos should be {} and is {}", pos, pos_as_is); + } catch (IndexOutOfBoundsException e) { + LOG.debug("Bad group number in rule {}", rule); } } return true; } - - - public String applyRuleFunctions(String tonormalize, MatchResult m) { - NormalizationManager norm = NormalizationManager.getInstance(language, find_temponyms); - - String normalized = ""; - // pattern for normalization functions + group information - // pattern for group information - Pattern paNorm = Pattern.compile("%([A-Za-z0-9]+?)\\(group\\(([0-9]+)\\)\\)"); - Pattern paGroup = Pattern.compile("group\\(([0-9]+)\\)"); - while ((tonormalize.contains("%")) || 
(tonormalize.contains("group"))) { - // replace normalization functions - for (MatchResult mr : Toolbox.findMatches(paNorm,tonormalize)) { - Logger.printDetail("-----------------------------------"); - Logger.printDetail("DEBUGGING: tonormalize:"+tonormalize); - Logger.printDetail("DEBUGGING: mr.group():"+mr.group()); - Logger.printDetail("DEBUGGING: mr.group(1):"+mr.group(1)); - Logger.printDetail("DEBUGGING: mr.group(2):"+mr.group(2)); - Logger.printDetail("DEBUGGING: m.group():"+m.group()); - Logger.printDetail("DEBUGGING: m.group("+Integer.parseInt(mr.group(2))+"):"+m.group(Integer.parseInt(mr.group(2)))); - Logger.printDetail("DEBUGGING: hmR...:"+norm.getFromHmAllNormalization(mr.group(1)).get(m.group(Integer.parseInt(mr.group(2))))); - Logger.printDetail("-----------------------------------"); - - if (! (m.group(Integer.parseInt(mr.group(2))) == null)) { - String partToReplace = m.group(Integer.parseInt(mr.group(2))).replaceAll("[\n\\s]+", " "); - if (!(norm.getFromHmAllNormalization(mr.group(1)).containsKey(partToReplace))) { - Logger.printDetail("Maybe problem with normalization of the resource: "+mr.group(1)); - Logger.printDetail("Maybe problem with part to replace? "+partToReplace); - if (mr.group(1).contains("Temponym")){ - Logger.printDetail("Should be ok, as it's a temponym."); - return null; - } - } - else { - tonormalize = tonormalize.replace(mr.group(), norm.getFromHmAllNormalization(mr.group(1)).get(partToReplace)); - } - } else { - Logger.printDetail("Empty part to normalize in "+mr.group(1)); - - tonormalize = tonormalize.replace(mr.group(), ""); - } - } - // replace other groups - for (MatchResult mr : Toolbox.findMatches(paGroup,tonormalize)) { - Logger.printDetail("-----------------------------------"); - Logger.printDetail("DEBUGGING: tonormalize:"+tonormalize); - Logger.printDetail("DEBUGGING: mr.group():"+mr.group()); - Logger.printDetail("DEBUGGING: mr.group(1):"+mr.group(1)); - Logger.printDetail("DEBUGGING: m.group():"+m.group()); - Logger.printDetail("DEBUGGING: m.group("+Integer.parseInt(mr.group(1))+"):"+m.group(Integer.parseInt(mr.group(1)))); - Logger.printDetail("-----------------------------------"); - - tonormalize = tonormalize.replace(mr.group(), m.group(Integer.parseInt(mr.group(1)))); - } - // replace substrings - Pattern paSubstring = Pattern.compile("%SUBSTRING%\\((.*?),([0-9]+),([0-9]+)\\)"); - for (MatchResult mr : Toolbox.findMatches(paSubstring,tonormalize)) { - String substring = mr.group(1).substring(Integer.parseInt(mr.group(2)), Integer.parseInt(mr.group(3))); - tonormalize = tonormalize.replace(mr.group(),substring); - } - if(language.getName().compareTo("arabic") != 0) - { - // replace lowercase - Pattern paLowercase = Pattern.compile("%LOWERCASE%\\((.*?)\\)"); - for (MatchResult mr : Toolbox.findMatches(paLowercase,tonormalize)) { - String substring = mr.group(1).toLowerCase(); - tonormalize = tonormalize.replace(mr.group(),substring); - } - - // replace uppercase - Pattern paUppercase = Pattern.compile("%UPPERCASE%\\((.*?)\\)"); - for (MatchResult mr : Toolbox.findMatches(paUppercase,tonormalize)) { - String substring = mr.group(1).toUpperCase(); - tonormalize = tonormalize.replace(mr.group(),substring); - } - } - // replace sum, concatenation - Pattern paSum = Pattern.compile("%SUM%\\((.*?),(.*?)\\)"); - for (MatchResult mr : Toolbox.findMatches(paSum,tonormalize)) { - int newValue = Integer.parseInt(mr.group(1)) + Integer.parseInt(mr.group(2)); - tonormalize = tonormalize.replace(mr.group(), newValue+""); - } - // replace 
normalization function without group - Pattern paNormNoGroup = Pattern.compile("%([A-Za-z0-9]+?)\\((.*?)\\)"); - for (MatchResult mr : Toolbox.findMatches(paNormNoGroup, tonormalize)) { - tonormalize = tonormalize.replace(mr.group(),norm.getFromHmAllNormalization(mr.group(1)).get(mr.group(2))); - } - // replace Chinese with Arabic numerals - Pattern paChineseNorm = Pattern.compile("%CHINESENUMBERS%\\((.*?)\\)"); - for (MatchResult mr : Toolbox.findMatches(paChineseNorm, tonormalize)) { - RegexHashMap chineseNumerals = new RegexHashMap(); - chineseNumerals.put("[零00]", "0"); - chineseNumerals.put("[一11]", "1"); - chineseNumerals.put("[二22]", "2"); - chineseNumerals.put("[三33]", "3"); - chineseNumerals.put("[四44]", "4"); - chineseNumerals.put("[五55]", "5"); - chineseNumerals.put("[六66]", "6"); - chineseNumerals.put("[七77]", "7"); - chineseNumerals.put("[八88]", "8"); - chineseNumerals.put("[九99]", "9"); - String outString = ""; - for(Integer i = 0; i < mr.group(1).length(); i++) { - String thisChar = mr.group(1).substring(i, i+1); - if(chineseNumerals.containsKey(thisChar)){ - outString += chineseNumerals.get(thisChar); - } else { - System.out.println(chineseNumerals.entrySet()); - Logger.printError(component, "Found an error in the resources: " + mr.group(1) + " contains " + - "a character that is not defined in the Chinese numerals map. Normalization may be mangled."); - outString += thisChar; - } - } - tonormalize = tonormalize.replace(mr.group(), outString); - } - } - normalized = tonormalize; - return normalized; - } - - - public String[] getAttributesForTimexFromFile(String rule, - HashMap hmNormalization, - HashMap hmQuant, - HashMap hmFreq, - HashMap hmMod, - HashMap hmEmptyValue, - MatchResult m, - JCas jcas) { + + public String[] getAttributesForTimexFromFile(String key, Rule rule, MatchResult m, JCas jcas) { String[] attributes = new String[5]; - String value = ""; - String quant = ""; - String freq = ""; - String mod = ""; - String emptyValue = ""; - + // Normalize Value - String value_normalization_pattern = hmNormalization.get(rule); - value = applyRuleFunctions(value_normalization_pattern, m); - if (value == null) return null; - + String value_normalization_pattern = rule.getNormalization(); + NormalizationManager norm = NormalizationManager.getInstance(language, find_temponyms); + String value = RuleExpansion.applyRuleFunctions(key, value_normalization_pattern, m, norm, language); + if (value == null) + return null; + // For example "PT24H" -> "P1D" + if (group_gran) + value = DurationSimplification.simplify(value); + attributes[0] = value; + // get quant - if (hmQuant.containsKey(rule)) { - String quant_normalization_pattern = hmQuant.get(rule); - quant = applyRuleFunctions(quant_normalization_pattern, m); - } + String quant_normalization_pattern = rule.getQuant(); + attributes[1] = (quant_normalization_pattern != null) ? RuleExpansion.applyRuleFunctions(key, quant_normalization_pattern, m, norm, language) : ""; // get freq - if (hmFreq.containsKey(rule)) { - String freq_normalization_pattern = hmFreq.get(rule); - freq = applyRuleFunctions(freq_normalization_pattern, m); - } - + String freq_normalization_pattern = rule.getFreq(); + attributes[2] = (freq_normalization_pattern != null) ? 
RuleExpansion.applyRuleFunctions(key, freq_normalization_pattern, m, norm, language) : ""; + // get mod - if (hmMod.containsKey(rule)) { - String mod_normalization_pattern = hmMod.get(rule); - mod = applyRuleFunctions(mod_normalization_pattern, m); - } - + String mod_normalization_pattern = rule.getMod(); + attributes[3] = (mod_normalization_pattern != null) ? RuleExpansion.applyRuleFunctions(key, mod_normalization_pattern, m, norm, language) : ""; + // get emptyValue - if (hmEmptyValue.containsKey(rule)) { - String emptyValue_normalization_pattern = hmEmptyValue.get(rule); - emptyValue = applyRuleFunctions(emptyValue_normalization_pattern, m); - emptyValue = correctDurationValue(emptyValue); - } - // For example "PT24H" -> "P1D" - if (group_gran) - value = correctDurationValue(value); + String emptyValue_normalization_pattern = rule.getEmptyValue(); + attributes[4] = (emptyValue_normalization_pattern != null) ? // + DurationSimplification.simplify(RuleExpansion.applyRuleFunctions(key, emptyValue_normalization_pattern, m, norm, language)) : ""; - attributes[0] = value; - attributes[1] = quant; - attributes[2] = freq; - attributes[3] = mod; - attributes[4] = emptyValue; - return attributes; } - /** - * Durations of a finer granularity are mapped to a coarser one if possible, e.g., "PT24H" -> "P1D". - * One may add several further corrections. - * @param value - * @return - */ - public String correctDurationValue(String value) { - if (value.matches("PT[0-9]+H")){ - for (MatchResult mr : Toolbox.findMatches(Pattern.compile("PT([0-9]+)H"), value)){ - try { - int hours = Integer.parseInt(mr.group(1)); - if ((hours % 24) == 0){ - int days = hours / 24; - value = "P"+days+"D"; - } - } catch(NumberFormatException e) { - Logger.printDetail(component, "Couldn't do granularity conversion for " + value); - } - } - } else if (value.matches("PT[0-9]+M")){ - for (MatchResult mr : Toolbox.findMatches(Pattern.compile("PT([0-9]+)M"), value)){ - try { - int minutes = Integer.parseInt(mr.group(1)); - if ((minutes % 60) == 0){ - int hours = minutes / 60; - value = "PT"+hours+"H"; - } - } catch(NumberFormatException e) { - Logger.printDetail(component, "Couldn't do granularity conversion for " + value); - } - } - } else if (value.matches("P[0-9]+M")){ - for (MatchResult mr : Toolbox.findMatches(Pattern.compile("P([0-9]+)M"), value)){ - try { - int months = Integer.parseInt(mr.group(1)); - if ((months % 12) == 0){ - int years = months / 12; - value = "P"+years+"Y"; - } - } catch(NumberFormatException e) { - Logger.printDetail(component, "Couldn't do granularity conversion for " + value); - } - } - } - return value; - } - - /** - * Check whether or not a jcas object has a correct DCT value. - * If there is no DCT present, we canonically return true since - * fallback calculation takes care of that scenario. - * @param jcas - * @return Whether or not the given jcas contains a valid DCT + * takes a desired locale input string, iterates through available locales, returns a locale object + * + * @param locale + * String to grab a locale for, i.e. 
en_US, en_GB, de_DE + * @return Locale to represent the input String */ - private Boolean isValidDCT(JCas jcas) { - FSIterator dctIter = jcas.getAnnotationIndex(Dct.type).iterator(); - - if(!dctIter.hasNext()) { - return true; - } else { - Dct dct = (Dct) dctIter.next(); - String dctVal = dct.getValue(); - - if(dctVal == null) - return false; - - if(dctVal.matches("\\d{8}") // Something like 20041224 - || dctVal.matches("\\d{4}.\\d{2}.\\d{2}.*")) { // Something like 2004-12-24 - return true; - } else { - return false; - } - } + public static Locale getLocaleFromString(String locale) throws LocaleException { + for (Locale l : Locale.getAvailableLocales()) + if (locale.equalsIgnoreCase(l.toString())) + return l; + throw new LocaleException(); } } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/ProcessorManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/ProcessorManager.java index 11f06d73..0bd9f8c1 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/ProcessorManager.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/ProcessorManager.java @@ -5,9 +5,10 @@ import org.apache.uima.UimaContext; import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import de.unihd.dbs.uima.annotator.heideltime.processors.GenericProcessor; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; /** * This class implements a singleton "Addon Manager". Any subroutine (Processor) that * may be added to HeidelTime's code to achieve a specific goal which is self-sufficient, @@ -20,12 +21,13 @@ * */ public class ProcessorManager { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(ProcessorManager.class); + // list of processes' package names private EnumMap> processorNames; // array of instantiated processors private EnumMap> processors; - // self-identifying component for logging purposes - private Class component; // flag for whether the processors have been initialized private boolean initialized = false; @@ -34,7 +36,6 @@ public class ProcessorManager { */ public ProcessorManager() { this.processorNames = new EnumMap>(Priority.class); - this.component = this.getClass(); this.processors = new EnumMap>(Priority.class); for(Priority prio : Priority.values()) { @@ -74,8 +75,7 @@ public void initializeAllProcessors(UimaContext aContext) { p.initialize(aContext); processors.get(prio).add(p); } catch (Exception exception) { - exception.printStackTrace(); - Logger.printError(component, "Unable to initialize registered Processor " + pn + ", got: " + exception.toString()); + LOG.error("Unable to initialize registered Processor " + pn + ", got: " + exception.toString(), exception); System.exit(-1); } } @@ -91,7 +91,7 @@ public void initializeAllProcessors(UimaContext aContext) { */ public void executeProcessors(JCas jcas, ProcessorManager.Priority prio) { if(!this.initialized) { - Logger.printError(component, "Unable to execute Processors; initialization was not concluded successfully."); + LOG.error("Unable to execute Processors; initialization was not concluded successfully."); System.exit(-1); } @@ -100,8 +100,7 @@ public void executeProcessors(JCas jcas, ProcessorManager.Priority prio) { try { gp.process(jcas); } catch (Exception exception) { - exception.printStackTrace(); - Logger.printError(component, "Unable to process registered Processor " + gp.getClass().getName() + ", got: " + exception.toString()); + LOG.error("Unable to process registered Processor " + gp.getClass().getName() + ", got: " + exception.toString(), 
exception); System.exit(-1); } } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/ResolveAmbiguousValues.java b/src/de/unihd/dbs/uima/annotator/heideltime/ResolveAmbiguousValues.java new file mode 100644 index 00000000..2646ee21 --- /dev/null +++ b/src/de/unihd/dbs/uima/annotator/heideltime/ResolveAmbiguousValues.java @@ -0,0 +1,777 @@ +package de.unihd.dbs.uima.annotator.heideltime; + +import static de.unihd.dbs.uima.annotator.heideltime.utilities.ContextAnalyzer.*; +import static de.unihd.dbs.uima.annotator.heideltime.utilities.DateCalculator.*; +import static de.unihd.dbs.uima.annotator.heideltime.utilities.ParseInteger.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.unihd.dbs.uima.annotator.heideltime.resources.Language; +import de.unihd.dbs.uima.annotator.heideltime.resources.NormalizationManager; +import de.unihd.dbs.uima.annotator.heideltime.utilities.ContextAnalyzer.Tense; +import de.unihd.dbs.uima.annotator.heideltime.utilities.Season; +import de.unihd.dbs.uima.types.heideltime.Dct; +import de.unihd.dbs.uima.types.heideltime.Timex3; + +class ResolveAmbiguousValues { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(ResolveAmbiguousValues.class); + + private static final Pattern UNDEF_PATTERN = Pattern.compile("^UNDEF-(this|REFUNIT|REF)-(.*?)-(MINUS|PLUS)-([0-9]+)"); + + private static final Pattern UNDEF_UNIT = Pattern.compile("^UNDEF-(last|this|next)-(century|decade|year|quarter|month|week|day)"); + + private static final Pattern UNDEF_MONTH = Pattern.compile("^UNDEF-(last|this|next)-(january|february|march|april|may|june|july|august|september|october|november|december)(?:-([0-9][0-9]))?"); + + private static final Pattern UNDEF_SEASON = Pattern.compile("^UNDEF-(last|this|next)-(SP|SU|FA|WI)"); + + private static final Pattern UNDEF_WEEKDAY = Pattern.compile("^UNDEF-(last|this|next|day)-(monday|tuesday|wednesday|thursday|friday|saturday|sunday)"); + + private static final Pattern TWO_DIGITS = Pattern.compile("^\\d\\d$"); + + private static final Pattern THREE_DIGITS = Pattern.compile("^\\d\\d\\d$"); + + // Document creation time + public static class ParsedDct { + String dctValue = ""; + int dctCentury = 0, dctYear = 0, dctDecade = 0, dctMonth = 0, dctDay = 0; + Season dctSeason = null; + String dctQuarter = ""; + String dctHalf = ""; + int dctWeekday = 0, dctWeek = 0; + + private ParsedDct(String dctValue) { + // year, month, day as mentioned in the DCT + dctYear = parseInt(dctValue, 0, 4); + dctCentury = dctYear / 100; + dctDecade = parseInt(dctValue, 2, 3); + // Could be separated by slashes, or not. 
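To make the index arithmetic concrete, a worked example for the two accepted DCT shapes:

// dctValue = "2004-12-24": charAt(4) == '-', so dctMonth = parseInt(dctValue, 5, 7) = 12
//                          and dctDay = parseInt(dctValue, 8, 10) = 24.
// dctValue = "20041224":   charAt(4) == '1', so dctMonth = parseInt(dctValue, 4, 6) = 12
//                          and dctDay = parseInt(dctValue, 6, 8) = 24.
// Either way dctYear = 2004, dctCentury = 20, and dctDecade = parseInt(dctValue, 2, 3) = 0.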
+ if (Character.isDigit(dctValue.charAt(4))) { + dctMonth = parseInt(dctValue, 4, 6); + dctDay = parseInt(dctValue, 6, 8); + } else { + dctMonth = parseInt(dctValue, 5, 7); + dctDay = parseInt(dctValue, 8, 10); + } + dctQuarter = getQuarterOfMonth(dctMonth); + dctHalf = getHalfYearOfMonth(dctMonth); + + // season, week, weekday, have to be calculated + dctSeason = getSeasonOfMonth(dctMonth); + dctWeekday = getWeekdayOfDate(dctYear, dctMonth, dctDay); + dctWeek = getWeekOfDate(dctYear, dctMonth, dctDay); + + if (LOG.isDebugEnabled()) { + LOG.debug("dctCentury: {}", dctCentury); + LOG.debug("dctYear: {}", dctYear); + LOG.debug("dctDecade: {}", dctDecade); + LOG.debug("dctMonth: {}", dctMonth); + LOG.debug("dctDay: {}", dctDay); + LOG.debug("dctQuarter: {}", dctQuarter); + LOG.debug("dctSeason: {}", dctSeason); + LOG.debug("dctWeekday: {}", dctWeekday); + LOG.debug("dctWeek: {}", dctWeek); + } + } + + public static ParsedDct read(JCas jcas) { + String dctString = getDct(jcas); + return dctString != null ? new ParsedDct(dctString) : null; + } + + public static String getDct(JCas jcas) { + AnnotationIndex dcts = jcas.getAnnotationIndex(Dct.type); + FSIterator dctIter = dcts.iterator(); + return dctIter.hasNext() ? dctIter.next().getValue() : null; + } + + private static final Pattern VALID_DCT = Pattern.compile("^\\d{4}[.-]?\\d{2}[.-]?\\d{2}"); + + /** + * Check whether or not a jcas object has a correct DCT value. If there is no DCT present, we canonically return true since fallback calculation takes care of that scenario. + * + * @param jcas + * @return Whether or not the given jcas contains a valid DCT + */ + public static boolean isValidDCT(JCas jcas) { + String dctString = getDct(jcas); + // Something like 20041224 or 2004-12-24 + return dctString == null || VALID_DCT.matcher(dctString).find(); + } + } + + NormalizationManager norm; + + Language language; + + private DocumentType documentType; + + public void init(Language language, boolean find_temponyms, DocumentType typeToProcess) { + if (this.language != language) { + this.language = language; + norm = NormalizationManager.getInstance(language, find_temponyms); + } + this.documentType = typeToProcess; + } + + public String specifyAmbiguousValuesString(String ambigString, Timex3 t_i, int i, List linearDates, JCas jcas) { + if (!ambigString.startsWith("UNDEF")) + return ambigString; + // If available, parse document creation time: + ParsedDct dct = ParsedDct.read(jcas); // was: (documentType != DocumentType.NARRATIVE) ? ParsedDct.read(jcas) : null; + + // get the last tense (depending on the part of speech tags used in front or behind the expression) + Tense last_used_tense = getLastTense(t_i, jcas, language); + + // DISAMBIGUATION PHASE: + if (ambigString.equals("UNDEF-REFDATE")) + return i > 0 ? linearDates.get(i - 1).getTimexValue() : "XXXX-XX-XX"; + // Different patterns: + String repl = handleUndefYear(ambigString, linearDates, i, dct, last_used_tense); + repl = repl != null ? repl : handleUndefCentury(ambigString, linearDates, i, dct, last_used_tense); + repl = repl != null ? repl : handleUndefPlusMinus(ambigString, linearDates, i, dct); + repl = repl != null ? repl : handleUndefNextPrevThis(ambigString, linearDates, i, dct); + repl = repl != null ? repl : handleUndefMonth(ambigString, linearDates, i, dct); + repl = repl != null ? repl : handleUndefSeason(ambigString, linearDates, i, dct); + repl = repl != null ? 
repl : handleUndefWeekday(ambigString, linearDates, i, dct, last_used_tense); + if (repl == null) { + LOG.warn("Unhandled UNDEF value: {}", ambigString); + return ambigString; + } + return repl; + } + + private String handleUndefPlusMinus(String ambigString, List linearDates, int i, ParsedDct dct) { + Matcher m = UNDEF_PATTERN.matcher(ambigString); + if (!m.find()) + return null; + boolean fuzz = !ambigString.regionMatches(m.start(1), "REFUNIT", 0, 7); + String unit = m.group(2); + boolean positive = ambigString.regionMatches(m.start(3), "PLUS", 0, 4); // May only be PLUS or MINUS. + try { + int diff = parseInt(ambigString, m.start(4), m.end(4)); + diff = positive ? diff : -diff; // Signed diff + String rep = adjustByUnit(linearDates, i, dct, unit, diff, fuzz); + if (rep == null) + return ambigString; + StringBuilder valueNew = join(rep, ambigString, m.end()); + if ("year".equals(unit)) + handleFiscalYear(valueNew); + return valueNew.toString(); + } catch (NumberFormatException e) { + LOG.error("Invalid integer {} in {}", m.group(4), ambigString); + return positive ? "FUTURE_REF" : "PAST_REF"; + } + } + + private String handleUndefNextPrevThis(String ambigString, List linearDates, int i, ParsedDct dct) { + Matcher m = UNDEF_UNIT.matcher(ambigString); + if (!m.find()) + return null; + String rel = m.group(1), unit = m.group(2); + int sdiff = 0; + switch (rel) { + case "this": + break; + case "last": + sdiff = -1; + break; + case "next": + sdiff = +1; + break; + default: + LOG.warn("Unknown relationship {} in {}", rel, ambigString); + return null; + } + String rep = adjustByUnit(linearDates, i, dct, unit, sdiff, true); + if (rep == null) + return ambigString; + StringBuilder valueNew = join(rep, ambigString, m.end()); + if ("year".equals(unit)) + handleFiscalYear(valueNew); + return valueNew.toString(); + } + + /** + * Adjust a date. + * + * @param linearDates + * Date mentions + * @param i + * Position + * @param dct + * Document creation time + * @param unit + * Unit + * @param sdiff + * Difference + * @param fuzz + * Fuzzing factor + * @return Adjusted date, or null. + */ + private String adjustByUnit(List linearDates, int i, ParsedDct dct, String unit, int sdiff, boolean fuzz) { + // do the processing for SCIENTIFIC documents (TPZ identification could be improved) + if (documentType == DocumentType.SCIENTIFIC) + return formatScientific(unit, sdiff); + // TODO: BC dates are likely not handled correctly everywhere, although some cases may never occur, because we won't have day information BC. + switch (unit) { + case "century": + if (dct != null) + return norm.normNumber(dct.dctCentury + sdiff); + String lmCentury = getLastMentionedCentury(linearDates, i); + return lmCentury.isEmpty() ? "XX" : getXNextCentury(lmCentury, sdiff); + case "decade": + if (dct != null) + return (Integer.toString(dct.dctYear + sdiff * 10)).substring(0, 3); + String lmDecade = getLastMentionedDecade(linearDates, i); + return lmDecade.isEmpty() ? "XXXX" : getXNextDecade(lmDecade, sdiff); + case "year": + if (fuzz) { // Use year precision + if (dct != null) + return Integer.toString(dct.dctYear + sdiff); + String lmYear = getLastMentionedYear(linearDates, i); + return lmYear.isEmpty() ? "XXXX" : getXNextYear(lmYear, sdiff); + } + // Use day precision, if possible + // FIXME: Use dct? + String dateWithYear = getLastMentionedDateYear(linearDates, i); + if (dateWithYear.length() == 0) + return "XXXX"; + // FIXME: clean up BC handling! + final int p = dateWithYear.startsWith("BC") ? 
6 : 4; + String year = dateWithYear.substring(0, p); + String rest = dateWithYear.substring(p); + String yearNew = getXNextYear(year, sdiff); + return yearNew + rest; + case "quarter": + // TODO: assert not BC? + if (dct != null) { + // Use quarters, 0 to 3, for computation. + int quarters = (dct.dctYear << 2) + parseIntAt(dct.dctQuarter, 1) - 1 + sdiff; + return (quarters >> 2) + "-Q" + ((quarters & 0x3) + 1); + } + String lmQuarter = getLastMentionedQuarter(linearDates, i, language); + if (lmQuarter.isEmpty()) + return "XXXX-XX"; + // Use quarters, 0 to 3, for computation. + int quarters = (parseInt(lmQuarter, 0, 4) << 2) + parseIntAt(lmQuarter, 6) - 1 + sdiff; + return (quarters >> 2) + "-Q" + ((quarters & 0x3) + 1); + case "month": + // TODO: assert not BC? + if (dct != null) + return getXNextMonth(dct.dctYear + "-" + norm.normNumber(dct.dctMonth), sdiff); + String lmMonth = getLastMentionedMonth(linearDates, i); + return lmMonth.isEmpty() ? "XXXX-XX" : getXNextMonth(lmMonth, sdiff); + case "week": + // TODO: assert not BC? + if (fuzz /* && (sdiff > 1 || sdiff < -1) */) { // Use week precision + if (dct != null) + return getXNextWeek(dct.dctYear + "-W" + norm.normNumber(dct.dctWeek), sdiff); + String lmWeek = getLastMentionedWeek(linearDates, i); + return lmWeek.isEmpty() ? "XXXX-WXX" : getXNextWeek(lmWeek, sdiff); + } + // Use day precision, if possible + if (dct != null) + return getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, sdiff * 7); + String lmDayW = getLastMentionedDay(linearDates, i); + return lmDayW.isEmpty() ? "XXXX-WXX" : getXNextDay(lmDayW, sdiff * 7); + case "day": + if (dct != null) + return getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, sdiff); + String lmDay = getLastMentionedDay(linearDates, i); + return lmDay.isEmpty() ? "XXXX-XX-XX" : getXNextDay(lmDay, sdiff); + case "minute": + case "second": + case "hour": + // FIXME: support these, too? + return null; + case "week-WE": + // TODO: assert not BC? + if (fuzz /* && (sdiff > 1 || sdiff < -1) */) { // Use week precision + if (dct != null) + return getXNextWeek(dct.dctYear + "-W" + norm.normNumber(dct.dctWeek), sdiff); + String lmWeek = getLastMentionedWeek(linearDates, i); + return lmWeek.isEmpty() ? "XXXX-WXX-WE" : getXNextWeek(lmWeek, sdiff); + } + // Use day precision, if possible + if (dct != null) + return getXNextWeek(dct.dctYear, dct.dctMonth, dct.dctDay, sdiff) + "-WE"; + String lmWeek = getLastMentionedWeek(linearDates, i); + return lmWeek.isEmpty() ? "XXXX-WXX-WE" : getXNextWeek(lmWeek, sdiff) + "-WE"; + default: + LOG.warn("Unknown unit {}", unit); + return null; + } + } + + private String formatScientific(String unit, int sdiff) { + final String fmt; + switch (unit) { + case "year": + fmt = "TPZ%c%04d"; + break; + case "month": + fmt = "TPZ%c0000-%02d"; + break; + case "week": + fmt = "TPZ%c0000-W%02d"; + break; + case "day": + fmt = "TPZ%c0000-00-%02d"; + break; + case "hour": + fmt = "TPZ%c0000-00-00T%02d"; + break; + case "minute": + fmt = "TPZ%c0000-00-00T00:%02d"; + break; + case "second": + fmt = "TPZ%c0000-00-00T00:00:%02d"; + break; + default: + LOG.error("no scientific format for unit type {}", unit); + return null; + } + return String.format(Locale.ROOT, fmt, sdiff >= 0 ? '+' : '-', Math.abs(sdiff)); + } + + private String handleUndefYear(String ambigString, List linearDates, int i, ParsedDct dct, Tense last_used_tense) { + if (!ambigString.startsWith("UNDEF-year")) + return null; + last_used_tense = last_used_tense != null ? 
last_used_tense // + // In COLLOQUIAL, default to present/future, otherwise assume past (if undefined). + : (documentType == DocumentType.COLLOQUIAL ? Tense.PRESENTFUTURE : Tense.PAST); + String[] valueParts = ambigString.split("-"); + String repl; + if (dct != null && valueParts.length > 2) { + int newYear = dct.dctYear; + String part2 = valueParts[2]; + Season viThisSeason; + // get vi month + if (TWO_DIGITS.matcher(part2).matches()) { + // FIXME: check range of month and day? + int viThisMonth = parseInt(part2); + // Get day in vi + int viThisDay = (valueParts.length > 3 && TWO_DIGITS.matcher(valueParts[3]).matches()) // + ? parseInt(valueParts[3]) : -1; + // Tense is FUTURE + if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) { + // if dct-month is larger than vi-month, then add 1 to dct-year + if (dct.dctMonth > viThisMonth || // + (dct.dctMonth == viThisMonth && viThisDay > 0 && dct.dctDay > viThisDay)) + ++newYear; + } + // Tense is PAST + else if (last_used_tense == Tense.PAST) { + // if dct-month is smaller than vi month, then subtract 1 from dct-year + if (dct.dctMonth < viThisMonth || // + (dct.dctMonth == viThisMonth && viThisDay > 0 && dct.dctDay < viThisDay)) + --newYear; + } + } + // get vi season + else if ((viThisSeason = Season.of(part2)) != null) { + // Tense is FUTURE + if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) { + // if dct-month is larger than vi-month, then add 1 to dct-year + if (dct.dctSeason.ord() > viThisSeason.ord()) + ++newYear; + } + // Tense is PAST + else if (last_used_tense == Tense.PAST) { + // if dct-month is smaller than vi month, then subtract 1 from dct-year + if (dct.dctSeason.ord() < viThisSeason.ord()) + --newYear; + } + } + // get vi quarter + else if (part2.charAt(0) == 'Q' && part2.charAt(1) >= '1' && part2.charAt(1) <= '4') { + // Tense is FUTURE + if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) { + if (parseIntAt(dct.dctQuarter, 1) > parseIntAt(part2, 1)) + ++newYear; + } + // Tense is PAST + if (last_used_tense == Tense.PAST) { + if (parseIntAt(dct.dctQuarter, 1) < parseIntAt(part2, 1)) + --newYear; + } + } + // get vi half + else if (part2.charAt(0) == 'H' && (part2.equals("H1") || part2.equals("H2"))) { + // Tense is FUTURE + if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) { + if (parseIntAt(dct.dctHalf, 1) > parseIntAt(part2, 1)) + ++newYear; + } + // Tense is PAST + if (last_used_tense == Tense.PAST) { + if (parseIntAt(dct.dctHalf, 1) < parseIntAt(part2, 1)) + --newYear; + } + } + // get vi Week + else if (part2.charAt(0) == 'W') { + // Tense is FUTURE + if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) { + if (dct.dctWeek > parseIntAt(part2, 1)) + ++newYear; + } + // Tense is PAST + if (last_used_tense == Tense.PAST) { + if (dct.dctWeek < parseIntAt(part2, 1)) + --newYear; + } + } + repl = Integer.toString(newYear); + } else { + repl = getLastMentionedYear(linearDates, i); + if (repl.isEmpty()) + repl = "XXXX"; + } + // REPLACE THE UNDEF-YEAR WITH THE NEWLY CALCULATED YEAR + return join(repl, ambigString, "UNDEF-year".length()).toString(); + } + + private String handleUndefCentury(String ambigString, List linearDates, int i, ParsedDct dct, Tense last_used_tense) { + if (!ambigString.startsWith("UNDEF-century")) + return null; + String repl = dct != null ? 
Integer.toString(dct.dctCentury) : ""; + + // FIXME: supposed to be NEWS and COLLOQUIAL DOCUMENTS + if (dct != null) { + int viThisDecade = parseInt(ambigString, 13, 14); + // Tense is FUTURE + if (last_used_tense == Tense.FUTURE || last_used_tense == Tense.PRESENTFUTURE) + repl = Integer.toString(dct.dctCentury + (viThisDecade < dct.dctDecade ? 1 : 0)); + // Tense is PAST + else if (last_used_tense == Tense.PAST) + repl = Integer.toString(dct.dctCentury - (dct.dctDecade < viThisDecade ? 1 : 0)); + } + // NARRATIVE DOCUMENTS + else { + repl = getLastMentionedCentury(linearDates, i); + if (!repl.startsWith("BC")) { + if (repl.matches("^\\d\\d.*") && parseInt(repl, 0, 2) < 10) + repl = "00"; + } else { + repl = "00"; + } + } + // LREC change: assume in narrative-style documents that + // if no other century was mentioned before, 1st century + // Otherwise, assume that sixties, twenties, and so on + // are 19XX if no century found (LREC change) + if (repl.isEmpty()) + repl = (documentType == DocumentType.NARRATIVE ? "00" : "19"); + StringBuilder valueNew = join(repl, ambigString, "UNDEF-century".length()); + // always assume that sixties, twenties, and so on are 19XX -- if + // not narrative document (LREC change) + if (documentType != DocumentType.NARRATIVE && THREE_DIGITS.matcher(valueNew).matches()) + valueNew.replace(0, 2, "19"); + return valueNew.toString(); + } + + private String handleUndefMonth(String ambigString, List linearDates, int i, ParsedDct dct) { + Matcher m = UNDEF_MONTH.matcher(ambigString); + if (!m.find()) + return null; + String ltn = m.group(1), newMonth = norm.getFromNormMonthName(m.group(2)), daystr = m.group(3); + String repl = "XXXX-XX"; + if (ltn.equals("last")) { + if (dct != null) { + int newYear = dct.dctYear; + int newMonthInt = parseInt(newMonth); + int day = (daystr != null && daystr.length() > 0) ? parseInt(daystr) : 0; + // check day if dct-month and newMonth are equal + if (dct.dctMonth == newMonthInt) { + if (day != 0 && dct.dctDay <= day) + --newYear; + } else if (dct.dctMonth <= newMonthInt) + --newYear; + // TODO: 'format' year? could be < 1000. + repl = newYear + "-" + newMonth; + } else { + String lmMonth = getLastMentionedMonthDetails(linearDates, i); + if (!lmMonth.isEmpty()) { + int lmMonthInt = parseInt(lmMonth, 5, 7); + int lmDayInt = 0; + if (lmMonth.length() > 9 && TWO_DIGITS.matcher(lmMonth.subSequence(8, 10)).matches()) + lmDayInt = parseInt(lmMonth, 8, 10); + int newYear = parseInt(lmMonth, 0, 4); + int newMonthInt = parseInt(newMonth); + int day = (daystr != null && daystr.length() > 0) ? parseInt(daystr) : 0; + if (lmMonthInt == newMonthInt) { + if (lmDayInt != 0 && day != 0 && lmDayInt <= day) + --newYear; + } else if (lmMonthInt <= newMonthInt) + --newYear; + // TODO: 'format' year? could be < 1000. + repl = newYear + "-" + newMonth; + } + } + } else if (ltn.equals("this")) { + if (dct != null) { + // TODO: 'format' year? could be < 1000. + repl = dct.dctYear + "-" + newMonth; + } else { + String lmMonth = getLastMentionedMonthDetails(linearDates, i); + if (!lmMonth.isEmpty()) + repl = lmMonth.substring(0, 4) + "-" + newMonth; + } + } else if (ltn.equals("next")) { + if (dct != null) { + int newYear = dct.dctYear; + int newMonthInt = parseInt(newMonth); + int day = (daystr != null && daystr.length() > 0) ? 
parseInt(daystr) : 0; + // check day if dct-month and newMonth are equal + if (dct.dctMonth == newMonthInt) { + if (day != 0 && dct.dctDay >= day) + ++newYear; + } else if (dct.dctMonth >= newMonthInt) + ++newYear; + // TODO: 'format' year? could be < 1000. + repl = newYear + "-" + newMonth; + } else { + String lmMonth = getLastMentionedMonthDetails(linearDates, i); + if (!lmMonth.isEmpty()) { + int newYear = parseInt(lmMonth, 0, 4), lmMonthInt = parseInt(lmMonth, 5, 7); + int newMonthInt = parseInt(newMonth); + if (lmMonthInt >= newMonthInt) + ++newYear; + // TODO: 'format' year? could be < 1000. + repl = newYear + "-" + newMonth; + } + } + } else { + LOG.warn("Unhandled undef-month: {}", ltn); + } + return join(repl, ambigString, m.end()).toString(); + } + + private String handleUndefSeason(String ambigString, List linearDates, int i, ParsedDct dct) { + Matcher m = UNDEF_SEASON.matcher(ambigString); + if (!m.find()) + return null; + String ltn = m.group(1); + Season newSeason = Season.of(ambigString, m.start(2)); + String repl = "XXXX-XX"; + if (ltn.equals("last")) { + if (dct != null) { + int newYear = dct.dctYear - (newSeason.ord() < dct.dctSeason.ord() // + || (dct.dctSeason == Season.WINTER && dct.dctMonth < 12) // + ? 1 : 0); + // TODO: 'format' year? could be < 1000. + repl = newYear + "-" + newSeason; + } else { // NARRATIVE DOCUMENT + String lmSeason = getLastMentionedSeason(linearDates, i, language); + if (lmSeason != null && !lmSeason.isEmpty()) { + Season se = Season.of(lmSeason, 5); + int newYear = parseInt(lmSeason, 0, 4) - (newSeason.ord() < se.ord() ? 1 : 0); + // TODO: 'format' year? could be < 1000. + repl = newYear + "-" + newSeason; + } + } + } else if (ltn.equals("this")) { + // TODO use tense of sentence? + if (dct != null) { + // TODO: 'format' year? could be < 1000. + repl = dct.dctYear + "-" + newSeason; + } else { + String lmSeason = getLastMentionedSeason(linearDates, i, language); + if (lmSeason != null && !lmSeason.isEmpty()) + repl = lmSeason.substring(0, 4) + "-" + newSeason; + } + } else if (ltn.equals("next")) { + if (dct != null) { + int newYear = dct.dctYear + (newSeason.ord() <= dct.dctSeason.ord() ? 1 : 0); + // TODO: 'format' year? could be < 1000. + repl = newYear + "-" + newSeason; + } else { // NARRATIVE DOCUMENT + String lmSeason = getLastMentionedSeason(linearDates, i, language); + if (lmSeason != null && !lmSeason.isEmpty()) { + Season se = Season.of(lmSeason, 5); + int newYear = parseInt(lmSeason, 0, 4) + (newSeason.ord() <= se.ord() ? 1 : 0); + // TODO: 'format' year? could be < 1000. + repl = newYear + "-" + newSeason; + } + } + } else { + LOG.warn("Unhandled undef-season: {}", ltn); + } + return join(repl, ambigString, m.end()).toString(); + } + + private String handleUndefWeekday(String ambigString, List linearDates, int i, ParsedDct dct, Tense last_used_tense) { + Matcher m = UNDEF_WEEKDAY.matcher(ambigString); + if (!m.find()) + return null; + // TODO (before refactoring:) the calculation is strange, but works + // But we improved this during refactoring, is it less strange now? + // TODO tense should be included?! + String ltnd = m.group(1), newWeekday = m.group(2); + int newWeekdayInt = parseInt(norm.getFromNormDayInWeek(newWeekday)); + String repl = "XXXX-XX-XX"; + if (ltnd.equals("last")) { + if (dct != null) { + int diff = -(dct.dctWeekday - newWeekdayInt); + diff = (diff >= 0) ? 
diff - 7 : diff; + repl = getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, diff); + } else { + String lmDay = getLastMentionedDay(linearDates, i); + if (!lmDay.isEmpty()) { + int lmWeekdayInt = getWeekdayOfDate(lmDay); + int diff = -(lmWeekdayInt - newWeekdayInt); + diff = (diff >= 0) ? diff - 7 : diff; + repl = getXNextDay(lmDay, diff); + } + } + } else if (ltnd.equals("this")) { + if (dct != null) { + // TODO tense should be included?! + int diff = -(dct.dctWeekday - newWeekdayInt); + diff = (diff > 0) ? diff - 7 : diff; + repl = getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, diff); + } else { + // TODO tense should be included?! + String lmDay = getLastMentionedDay(linearDates, i); + if (!lmDay.isEmpty()) { + int lmWeekdayInt = getWeekdayOfDate(lmDay); + int diff = -(lmWeekdayInt - newWeekdayInt); + diff = (diff > 0) ? diff - 7 : diff; + repl = getXNextDay(lmDay, diff); + } + } + } else if (ltnd.equals("next")) { + if (dct != null) { + int diff = newWeekdayInt - dct.dctWeekday; + diff = (diff <= 0) ? diff + 7 : diff; + repl = getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, diff); + } else { + String lmDay = getLastMentionedDay(linearDates, i); + if (!lmDay.isEmpty()) { + int lmWeekdayInt = getWeekdayOfDate(lmDay); + int diff = newWeekdayInt - lmWeekdayInt; + diff = (diff <= 0) ? diff + 7 : diff; + repl = getXNextDay(lmDay, diff); + } + } + } else if (ltnd.equals("day")) { + if (dct != null) { + // TODO tense should be included?! + int diff = -(dct.dctWeekday - newWeekdayInt); + diff = (diff > 0) ? diff - 7 : diff; + // Tense is FUTURE + if ((last_used_tense == Tense.FUTURE) && diff != 0) + diff += 7; + // Tense is PAST + // if ((last_used_tense == Tense.PAST)) ? + repl = getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, diff); + } else { + // TODO tense should be included?! + String lmDay = getLastMentionedDay(linearDates, i); + if (!lmDay.isEmpty()) { + int lmWeekdayInt = getWeekdayOfDate(lmDay); + int diff = -(lmWeekdayInt - newWeekdayInt); + diff = (diff > 0) ? diff - 7 : diff; + repl = getXNextDay(lmDay, diff); + } + } + } else { + LOG.warn("Unhandled undef-weekday: {}", ltnd); + } + return join(repl, ambigString, m.end()).toString(); + } + + /** + * Join pre-string + post-string beginning at offsetPost, effectively replacing the first offsetPost characters with the pre string. + * + * @param pre + * Prefix + * @param post + * Postfix + * @param offsetPost + * Number of chars in postfix to skip. + * @return String builder, for further modification + */ + private static StringBuilder join(String pre, String post, final int offsetPost) { + StringBuilder valueNew = new StringBuilder(pre.length() + post.length() - offsetPost); + valueNew.append(pre); + valueNew.append(post, offsetPost, post.length()); + return valueNew; + } + + /** + * Under-specified values are disambiguated here. Only Timexes of types "date" and "time" can be under-specified. 
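A note on the quarter arithmetic near the top of this hunk: year and quarter (1-4) are packed into a single running count of quarters, so adding a signed offset and unpacking with shift and mask handles year rollover without special cases. A minimal, self-contained sketch (sample values invented):

    // Illustrative sketch only, not part of the patch.
    public class QuarterMathSketch {
        static String shiftQuarter(int year, int quarter, int sdiff) {
            // Pack into quarters 0..3 within the year, add the offset,
            // then unpack: >> 2 recovers the year, & 0x3 the quarter.
            int quarters = (year << 2) + quarter - 1 + sdiff;
            return (quarters >> 2) + "-Q" + ((quarters & 0x3) + 1);
        }

        public static void main(String[] args) {
            System.out.println(shiftQuarter(2010, 1, -2)); // -> 2009-Q3
            System.out.println(shiftQuarter(2010, 4, 1));  // -> 2011-Q1
        }
    }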
+ * + * @param jcas + */ + public void specifyAmbiguousValues(JCas jcas) { + // build up a list with all found TIMEX expressions + List linearDates = new ArrayList(); + AnnotationIndex timexes = jcas.getAnnotationIndex(Timex3.type); + + // Create List of all Timexes of types "date" and "time" + for (Timex3 timex : timexes) { + if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME")) + linearDates.add(timex); + + if (timex.getTimexType().equals("DURATION") && timex.getEmptyValue().length() > 0) + linearDates.add(timex); + } + + ////////////////////////////////////////////// + // go through list of Date and Time timexes // + ////////////////////////////////////////////// + for (int i = 0; i < linearDates.size(); i++) { + Timex3 t_i = linearDates.get(i); + String value_i = t_i.getTimexValue(); + + String valueNew = value_i; + // handle the value attribute only if we have a TIME or DATE + if (t_i.getTimexType().equals("TIME") || t_i.getTimexType().equals("DATE")) + valueNew = specifyAmbiguousValuesString(value_i, t_i, i, linearDates, jcas); + + // handle the emptyValue attribute for any type + if (t_i.getEmptyValue() != null && t_i.getEmptyValue().length() > 0) + t_i.setEmptyValue(specifyAmbiguousValuesString(t_i.getEmptyValue(), t_i, i, linearDates, jcas)); + + t_i.removeFromIndexes(); + if (LOG.isDebugEnabled() && !valueNew.equals(t_i.getTimexValue())) + LOG.debug("{} {} DISAMBIGUATION: foundBy: {} text: {} value: {} NEW value: {} ", // + t_i.getSentId(), t_i.getTimexId(), t_i.getFoundByRule(), t_i.getCoveredText(), t_i.getTimexValue(), valueNew); + + t_i.setTimexValue(valueNew); + t_i.addToIndexes(); + linearDates.set(i, t_i); + } + } + + /** + * Convert a -FY postfix to a FY prefix. + * + * @param buf + * Buffer to operate on + */ + private static void handleFiscalYear(StringBuilder buf) { + if (buf.length() < 4) + return; + // Unfortunately, StringBuilder does not have an "endsWith". + int p = buf.length() - 3; + if (buf.charAt(p) == '-' && buf.charAt(++p) == 'F' && buf.charAt(++p) == 'Y') { + // Keep at most the year: + buf.setLength(Math.min(p, 4)); + buf.insert(0, "FY"); + } + } +} \ No newline at end of file diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/DecadeProcessor.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/DecadeProcessor.java index 5f513624..76476c53 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/DecadeProcessor.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/DecadeProcessor.java @@ -6,36 +6,32 @@ import java.util.regex.Pattern; import org.apache.uima.UimaContext; -import org.apache.uima.cas.FSIterator; import org.apache.uima.jcas.JCas; import de.unihd.dbs.uima.types.heideltime.Timex3; public class DecadeProcessor extends GenericProcessor { - /** * Constructor just calls the parent constructor here. */ public DecadeProcessor() { super(); } - /** - * not needed here + * not needed here */ public void initialize(UimaContext aContext) { return; } - + /** * all the functionality was put into evaluateCalculationFunctions(). */ public void process(JCas jcas) { evaluateFunctions(jcas); } - - + /** * This function replaces function calls from the resource files with their TIMEX value. 
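The -FY handling just above is a plain suffix-to-prefix rewrite; a standalone sketch with an invented sample value:

    // Illustrative sketch only, mirroring handleFiscalYear above.
    public class FiscalYearSketch {
        static void handleFiscalYear(StringBuilder buf) {
            if (buf.length() < 4)
                return;
            int p = buf.length() - 3;
            if (buf.charAt(p) == '-' && buf.charAt(++p) == 'F' && buf.charAt(++p) == 'Y') {
                buf.setLength(Math.min(p, 4)); // keep at most the year
                buf.insert(0, "FY");
            }
        }

        public static void main(String[] args) {
            StringBuilder buf = new StringBuilder("2004-FY"); // invented sample
            handleFiscalYear(buf);
            System.out.println(buf); // -> FY2004
        }
    }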
* @@ -43,44 +39,33 @@ public void process(JCas jcas) { * @param jcas */ public void evaluateFunctions(JCas jcas) { - // build up a list with all found TIMEX expressions List linearDates = new ArrayList(); - FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator(); + Iterable timexes = jcas.getAnnotationIndex(Timex3.type); // Create List of all Timexes of types "date" and "time" - while (iterTimex.hasNext()) { - Timex3 timex = (Timex3) iterTimex.next(); - if (timex.getTimexType().equals("DATE")) { + for (Timex3 timex : timexes) + if (timex.getTimexType().equals("DATE")) linearDates.add(timex); - } - } - - + ////////////////////////////////////////////// // go through list of Date and Time timexes // ////////////////////////////////////////////// - //compile regex pattern for validating commands/arguments - Pattern cmd_p = Pattern.compile("(\\w\\w\\w\\w)-(\\w\\w)-(\\w\\w)\\s+decadeCalc\\((\\d+)\\)"); + // compile regex pattern for validating commands/arguments + Matcher cmd_p = Pattern.compile("(\\w\\w\\w\\w)-(\\w\\w)-(\\w\\w)\\s+decadeCalc\\((\\d+)\\)").matcher(""); - Matcher cmd_m; - String year; - String valueNew; - String argument; - for (int i = 0; i < linearDates.size(); i++) { - Timex3 t_i = (Timex3) linearDates.get(i); + Timex3 t_i = linearDates.get(i); String value_i = t_i.getTimexValue(); - cmd_m = cmd_p.matcher(value_i); - valueNew = value_i; - - if(cmd_m.matches()) { - year = cmd_m.group(1); - argument = cmd_m.group(4); - + Matcher cmd_m = cmd_p.reset(value_i); + String valueNew = value_i; + + if (cmd_m.matches()) { + String year = cmd_m.group(1); + String argument = cmd_m.group(4); valueNew = year.substring(0, Math.min(2, year.length())) + argument.substring(0, 1); } - + t_i.removeFromIndexes(); t_i.setTimexValue(valueNew); t_i.addToIndexes(); diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/HolidayProcessor.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/HolidayProcessor.java index 56bc67c5..12bb8dbf 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/HolidayProcessor.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/HolidayProcessor.java @@ -1,27 +1,30 @@ package de.unihd.dbs.uima.annotator.heideltime.processors; -import java.text.ParseException; -import java.text.SimpleDateFormat; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.time.temporal.WeekFields; import java.util.ArrayList; -import java.util.Calendar; -import java.util.Date; -import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.uima.UimaContext; -import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; import de.unihd.dbs.uima.types.heideltime.Timex3; + /** - * Addition to HeidelTime to recognize several (mostly, but not - * entirely christian) holidays. + * Addition to HeidelTime to recognize several (mostly, but not entirely Christian) holidays. + * + * @author Hans-Peter Pfeiffer + * */ public class HolidayProcessor extends GenericProcessor { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(HolidayProcessor.class); /** * Constructor just calls the parent constructor here. 
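The DecadeProcessor change above compiles the pattern once and rebinds a single Matcher with reset() instead of allocating a fresh Matcher per value; a self-contained sketch of that idiom (inputs invented):

    // Illustrative sketch only, not part of the patch.
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class MatcherReuseSketch {
        public static void main(String[] args) {
            Matcher m = Pattern.compile("(\\w\\w\\w\\w)-(\\w\\w)-(\\w\\w)\\s+decadeCalc\\((\\d+)\\)").matcher("");
            for (String value : new String[] { "1960-XX-XX decadeCalc(196)", "no match here" })
                if (m.reset(value).matches())
                    System.out.println(m.group(1) + " / " + m.group(4)); // -> 1960 / 196
        }
    }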
@@ -29,23 +32,25 @@ public class HolidayProcessor extends GenericProcessor { public HolidayProcessor() { super(); } - /** - * not needed here + * not needed here */ public void initialize(UimaContext aContext) { return; } - + /** * all the functionality was put into evaluateCalculationFunctions(). */ public void process(JCas jcas) { evaluateCalculationFunctions(jcas); } - - + + Pattern cmd_p = Pattern.compile("((\\w\\w\\w\\w)-(\\w\\w)-(\\w\\w))\\s+funcDateCalc\\((\\w+)\\((.+)\\)\\)"); + Pattern year_p = Pattern.compile("(\\d\\d\\d\\d)"); + Pattern date_p = Pattern.compile("(\\d\\d\\d\\d)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])"); + /** * This function replaces function calls from the resource files with their TIMEX value. * @@ -53,138 +58,87 @@ public void process(JCas jcas) { * @param jcas */ public void evaluateCalculationFunctions(JCas jcas) { + // compile regex pattern for validating commands/arguments + Matcher cmd_m = cmd_p.matcher(""); - // build up a list with all found TIMEX expressions - List linearDates = new ArrayList(); - FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator(); + AnnotationIndex timexes = jcas.getAnnotationIndex(Timex3.type); + // Avoid concurrent modification exceptions + ArrayList copy = new ArrayList(timexes.size()); + for (Timex3 timex : timexes) + if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME")) + copy.add(timex); - // Create List of all Timexes of types "date" and "time" - while (iterTimex.hasNext()) { - Timex3 timex = (Timex3) iterTimex.next(); - if ((timex.getTimexType().equals("DATE")) || (timex.getTimexType().equals("TIME"))) { - linearDates.add(timex); - } - } - - - ////////////////////////////////////////////// - // go through list of Date and Time timexes // - ////////////////////////////////////////////// - //compile regex pattern for validating commands/arguments - Pattern cmd_p = Pattern.compile("((\\w\\w\\w\\w)-(\\w\\w)-(\\w\\w))\\s+funcDateCalc\\((\\w+)\\((.+)\\)\\)"); - Pattern year_p = Pattern.compile("(\\d\\d\\d\\d)"); - Pattern date_p = Pattern.compile("(\\d\\d\\d\\d)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])"); - Matcher cmd_m; - Matcher year_m; - Matcher date_m; - String date; - String year; - String month; - String day; - String function; - String args[]; - String valueNew; - - for (int i = 0; i < linearDates.size(); i++) { - Timex3 t_i = (Timex3) linearDates.get(i); - String value_i = t_i.getTimexValue(); - cmd_m = cmd_p.matcher(value_i); - valueNew = value_i; - - if(cmd_m.matches()) { - date = cmd_m.group(1); - year = cmd_m.group(2); - month = cmd_m.group(3); - day = cmd_m.group(4); - function = cmd_m.group(5); - args = cmd_m.group(6).split("\\s*,\\s*"); - - //replace keywords in function with actual values - for(int j=0; j0) || (!count_itself && number <= 0)) { - if(day<=weekday) { - add = weekday - day; - } - else{ - add = weekday - day + 7; - } - } - else{ - if(day 0) || (!count_itself && number <= 0)) { + add = (day <= weekday) ? weekday - day : weekday - day + 7; + } else { + add = (day < weekday) ? weekday - day : weekday - day + 7; + } + add += ((number - 1) * 7); + return d.plusDays(add).format(FORMATTER); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return ""; } } - - + /** * Get the date of a the first, second, third etc. 
weekday in a month * @@ -402,25 +290,19 @@ public String getWeekdayOfMonth(int number, int weekday, int month, int year) { return getWeekdayRelativeTo(String.format("%04d-%02d-01", year, month), weekday, number, true); } - private int getJulianDifference(int year){ - //TODO: this is not entirely correct! - int century = year/100 + 1; - if(century<18){ - return 10; - } - if(century==18){ - return 11; - } - if(century==19){ - return 12; - } - if(century==20||century == 21){ - return 13; - } - if(century==22){ - return 14; - } - return 15; - } - + private int getJulianDifference(int year) { + // FIXME: this is not entirely correct! + int century = year / 100 + 1; + if (century < 18) + return 10; + if (century == 18) + return 11; + if (century == 19) + return 12; + if (century == 20 || century == 21) + return 13; + if (century == 22) + return 14; + return 15; + } } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorInitializationException.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorInitializationException.java index 46cd8134..d368c161 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorInitializationException.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorInitializationException.java @@ -3,10 +3,8 @@ import de.unihd.dbs.uima.annotator.heideltime.HeidelTimeException; public class ProcessorInitializationException extends HeidelTimeException { - /** - * + * Serialization version */ private static final long serialVersionUID = -4036889037291484936L; - } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorProcessingException.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorProcessingException.java index 315f9b2d..17dbc6c2 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorProcessingException.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorProcessingException.java @@ -3,10 +3,8 @@ import de.unihd.dbs.uima.annotator.heideltime.HeidelTimeException; public class ProcessorProcessingException extends HeidelTimeException { - /** - * + * Serialization version */ private static final long serialVersionUID = 6123306006146166368L; - } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/TemponymPostprocessing.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/TemponymPostprocessing.java index 67d0c3c1..fcb0fa3f 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/TemponymPostprocessing.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/TemponymPostprocessing.java @@ -1,81 +1,77 @@ package de.unihd.dbs.uima.annotator.heideltime.processors; import java.util.HashSet; -import java.util.regex.MatchResult; +import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox; import de.unihd.dbs.uima.types.heideltime.Timex3; import de.unihd.dbs.uima.types.heideltime.Timex3Interval; /** + * This class removes TIMEX3 annotations for temponyms and adds TIMEX3INTERVAL annotations containing (earliest|latest)(Begin|End) information. * - * This class removes TIMEX3 annotations for temponyms and adds - * TIMEX3INTERVAL annotations containing (earliest|latest)(Begin|End) information. 
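An aside on getWeekdayOfMonth above: java.time can compute the "Nth weekday of a month" directly with a temporal adjuster. This is an alternative illustration, not the implementation used in the patch:

    // Illustrative alternative, not the patch's implementation.
    import java.time.DayOfWeek;
    import java.time.LocalDate;
    import java.time.temporal.TemporalAdjusters;

    public class NthWeekdaySketch {
        public static void main(String[] args) {
            // Third Monday of January 2018 (invented sample values).
            LocalDate d = LocalDate.of(2018, 1, 1)
                    .with(TemporalAdjusters.dayOfWeekInMonth(3, DayOfWeek.MONDAY));
            System.out.println(d); // -> 2018-01-15
        }
    }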
* @author jannik stroetgen - * */ public class TemponymPostprocessing { - - public static void handleIntervals(JCas jcas){ - + private static final Logger LOG = LoggerFactory.getLogger(TemponymPostprocessing.class); + + private static final Pattern p = Pattern.compile("\\[(.*?), (.*?), (.*?), (.*?)\\]"); + + public static void handleIntervals(JCas jcas) { HashSet timexes = new HashSet<>(); - + + Matcher mr = p.matcher(""); // iterate over all TEMPONYMS - FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator(); - while (iterTimex.hasNext()) { - Timex3 t = (Timex3) iterTimex.next(); - if (t.getTimexType().equals("TEMPONYM")) { - - // create a timex3interval for each temponym - Timex3Interval ti = new Timex3Interval(jcas); + AnnotationIndex timex3s = jcas.getAnnotationIndex(Timex3.type); + for (Timex3 t : timex3s) { + if (!t.getTimexType().equals("TEMPONYM")) + continue; + LOG.debug("TEMPONYM: {}", t.getCoveredText()); + // create a timex3interval for each temponym + Timex3Interval ti = new Timex3Interval(jcas); - System.err.println("TEMPONYM: " + t.getCoveredText()); - - ti.setBegin(t.getBegin()); - ti.setEnd(t.getEnd()); - ti.setTimexType(t.getTimexType()); - ti.setAllTokIds(t.getAllTokIds()); - ti.setTimexFreq(t.getTimexFreq()); - ti.setTimexMod(t.getTimexMod()); - ti.setTimexQuant(t.getTimexQuant()); - // set a new id - String id = t.getTimexId(); - int newId = Integer.parseInt(id.replace("t", "")); - newId += 100000; - ti.setTimexId("t" + newId); + ti.setBegin(t.getBegin()); + ti.setEnd(t.getEnd()); + ti.setTimexType(t.getTimexType()); + ti.setAllTokIds(t.getAllTokIds()); + ti.setTimexFreq(t.getTimexFreq()); + ti.setTimexMod(t.getTimexMod()); + ti.setTimexQuant(t.getTimexQuant()); + // set a new id + String id = t.getTimexId(); + int newId = Integer.parseInt(id.replace("t", "")); + newId += 100000; + ti.setTimexId("t" + newId); - // get the (earliest|last)(begin|end) information - Pattern p = Pattern.compile("\\[(.*?), (.*?), (.*?), (.*?)\\]"); - for (MatchResult mr : Toolbox.findMatches(p,t.getTimexValue())) { - ti.setTimexValueEB(mr.group(1)); - ti.setTimexValueLB(mr.group(2)); - ti.setTimexValueEE(mr.group(3)); - ti.setTimexValueLE(mr.group(4)); - } - //System.err.println("temponym: " + t.getTimexValue()); - if ((ti.getTimexValueEB() == ti.getTimexValueLB()) && - (ti.getTimexValueLB() == ti.getTimexValueEE()) && - (ti.getTimexValueEE() == ti.getTimexValueLE())) { - ti.setTimexValue(ti.getTimexValueEB()); - t.setTimexValue(ti.getTimexValueEB()); - } - else { // what's the best single value for an interval!? - t.setEmptyValue(t.getTimexValue()); - ti.setTimexValue(ti.getTimexValueLE()); - t.setTimexValue(ti.getTimexValueLE()); - } - ti.setFoundByRule(t.getFoundByRule()); - ti.addToIndexes(); - timexes.add(t); + // get the (earliest|last)(begin|end) information + for (mr.reset(t.getTimexValue()); mr.find();) { + ti.setTimexValueEB(mr.group(1)); + ti.setTimexValueLB(mr.group(2)); + ti.setTimexValueEE(mr.group(3)); + ti.setTimexValueLE(mr.group(4)); } + // System.err.println("temponym: " + t.getTimexValue()); + if (ti.getTimexValueEB().equals(ti.getTimexValueLB()) && // + ti.getTimexValueLB().equals(ti.getTimexValueEE()) && // + ti.getTimexValueEE().equals(ti.getTimexValueLE())) { + ti.setTimexValue(ti.getTimexValueEB()); + t.setTimexValue(ti.getTimexValueEB()); + } else { // what's the best single value for an interval!? 
+ t.setEmptyValue(t.getTimexValue()); + ti.setTimexValue(ti.getTimexValueLE()); + t.setTimexValue(ti.getTimexValueLE()); + } + ti.setFoundByRule(t.getFoundByRule()); + ti.addToIndexes(); + timexes.add(t); } // shall the standard timexes really be removed? - for (Timex3 t : timexes){ + for (Timex3 t : timexes) t.removeFromIndexes(); - } } } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/GenericResourceManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/GenericResourceManager.java index 9eb233b7..29f7a0f1 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/GenericResourceManager.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/GenericResourceManager.java @@ -1,32 +1,77 @@ package de.unihd.dbs.uima.annotator.heideltime.resources; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + /** * - * Abstract class for all Resource Managers to inherit from. Contains basic - * functionality such as file system access and some private members. + * Abstract class for all Resource Managers to inherit from. Contains basic functionality such as file system access and some private members. * */ public abstract class GenericResourceManager { // language for the utilized resources - protected String LANGUAGE; + protected final String LANGUAGE; // kind of resource -- e.g. repattern, normalization, rules protected String resourceType; - // local package for logging output - protected Class component; - + /** * Instantiates the Resource Manager with a resource type - * @param resourceType kind of resource to represent + * + * @param resourceType + * kind of resource to represent */ protected GenericResourceManager(String resourceType, String language) { this.resourceType = resourceType; this.LANGUAGE = language; - this.component = this.getClass(); } - - protected String replaceSpaces(String inText) { - String outText = inText.replaceAll(" ", "[\\\\u2000-\\\\u200A \\\\u202F\\\\u205F\\\\u3000\\\\u00A0\\\\u1680\\\\u180E]+"); - - return outText; + + private static final Pattern WHITESPACE = Pattern.compile("(?: |\\\\[sS])"); + + public static String replaceSpaces(String inText) { + Matcher m = WHITESPACE.matcher(inText); + if (!m.find()) + return inText; + final int len = inText.length(); + StringBuilder buf = new StringBuilder(); + int lastpos = 0; + do { + int start = m.start(), end = m.end(); + final char lastchar = inText.charAt(end - 1); + assert (lastchar == ' ' || lastchar == 's' || lastchar == 'S'); + boolean negative = lastchar == 'S'; + boolean chargroup = false; + String extra = "+"; // By default, insert a plus. + if (end < len) { + char next = inText.charAt(end); + if (next == '?' || next == '*' || next == '+' || next == '{') + extra = null; // Preserve + if (next == ']' && start > 0 && inText.charAt(start - 1) == '[') { + // FIXME: intentionally empty? A space that is the sole content of a character group ("[ ]") gets no special handling here. + } + } + for (int s = end; s < len; s++) { + char next = inText.charAt(s); + if (next == '[' && inText.charAt(s - 1) != '\\') + break; // Supposedly not in a character group. + if (next == ']' && inText.charAt(s - 1) != '\\') { + chargroup = true; + break; + } + } + buf.append(inText, lastpos, start); + if (chargroup) { + // buf.append(negative ? "\\P{javaWhitespace}" : "\\p{javaWhitespace}"); + buf.append(negative ? "\\S" : "\\s"); + } else { + // buf.append(negative ? "[\\P{javaWhitespace}]" : "[\\p{javaWhitespace}]"); + buf.append(negative ? 
"\\S" : "\\s"); + if (extra != null) + buf.append(extra); + } + lastpos = end; + } while (m.find()); + if (lastpos < len) + buf.append(inText, lastpos, len); + return buf.toString(); } } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/Language.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/Language.java index 79f9c6c2..11637dfb 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/Language.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/Language.java @@ -1,6 +1,6 @@ package de.unihd.dbs.uima.annotator.heideltime.resources; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; +import org.slf4j.LoggerFactory; /** * Hardcoded Language information for use with HeidelTime/Standalone. Contains @@ -61,7 +61,7 @@ public enum Language { */ public final static Language getLanguageFromString(String name) { if(name == null) { - Logger.printError("Language parameter was specified as NULL."); + LoggerFactory.getLogger(Language.class).error("Language parameter was specified as NULL."); throw new NullPointerException(); } @@ -99,6 +99,10 @@ public final String getResourceFolder() { return this.resourceFolder; } + public final boolean useLowercase() { + return this != ARABIC; + } + @Override public String toString() { return getName(); diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/NormalizationManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/NormalizationManager.java index 2a6a96c8..ccdf7fd8 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/NormalizationManager.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/NormalizationManager.java @@ -5,22 +5,24 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashMap; -import java.util.regex.MatchResult; +import java.util.regex.Matcher; import java.util.regex.Pattern; -import de.unihd.dbs.uima.annotator.heideltime.utilities.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** - * * This class fills the role of a manager of all the Normalization resources. * It reads the data from a file system and fills up a bunch of HashMaps * with their information. + * * @author jannik stroetgen - * */ public class NormalizationManager extends GenericResourceManager { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(NormalizationManager.class); + protected static HashMap instances = new HashMap(); - // PATTERNS TO READ RESOURCES "RULES" AND "NORMALIZATION" - private Pattern paReadNormalizations = Pattern.compile("\"(.*?)\",\"(.*?)\""); // STORE PATTERNS AND NORMALIZATIONS private HashMap> hmAllNormalization; @@ -32,6 +34,8 @@ public class NormalizationManager extends GenericResourceManager { private HashMap normMonthInSeason; private HashMap normMonthInQuarter; + private String[] normNumbers; + /** * Constructor calls the parent constructor that sets language/resource parameters, * initializes basic and collects resource normalization patterns. 
@@ -60,9 +64,8 @@ private NormalizationManager(String language, Boolean load_temponym_resources) { ResourceScanner rs = ResourceScanner.getInstance(); ResourceMap hmResourcesNormalization = rs.getNormalizations(language); - for (String which : hmResourcesNormalization.keySet()) { + for (String which : hmResourcesNormalization.keySet()) hmAllNormalization.put(which, new RegexHashMap()); - } readNormalizationResources(hmResourcesNormalization, load_temponym_resources); } @@ -72,12 +75,12 @@ private NormalizationManager(String language, Boolean load_temponym_resources) { * @return singleton instance of NormalizationManager */ public static NormalizationManager getInstance(Language language, Boolean load_temponym_resources) { - if(!instances.containsKey(language.getName())) { - NormalizationManager nm = new NormalizationManager(language.getResourceFolder(), load_temponym_resources); + NormalizationManager nm = instances.get(language.getName()); + if(nm == null) { + nm = new NormalizationManager(language.getResourceFolder(), load_temponym_resources); instances.put(language.getName(), nm); } - - return instances.get(language.getName()); + return nm; } /** @@ -87,61 +90,40 @@ public static NormalizationManager getInstance(Language language, Boolean load_t * @param load_temponym_resources whether temponym resources are loaded */ public void readNormalizationResources(ResourceMap hmResourcesNormalization, Boolean load_temponym_resources) { + // PATTERNS TO READ RESOURCES "RULES" AND "NORMALIZATION" + Matcher maReadNormalizations = Pattern.compile("\"(.*?)\",\"(.*?)\"").matcher(""); + for (String resource : hmResourcesNormalization.keySet()) { + // read normalization resources with "Temponym" only if temponym tagging is selected + if (!load_temponym_resources && resource.contains("Temponym")) { + LOG.trace("No Temponym tagging selected. 
Skipping normalization resource: {}", resource); + continue; + } + LOG.debug("Adding normalization resource: {}", resource); + // create a buffered reader for every normalization resource file + try(InputStream is = hmResourcesNormalization.getInputStream(resource); // + InputStreamReader isr = new InputStreamReader(is, "UTF-8");// + BufferedReader br = new BufferedReader(isr)) { + for (String line; (line=br.readLine()) != null; ) { + if (line.startsWith("//") || line.length() == 0) continue; // ignore comments and empty lines - InputStream is = null; - InputStreamReader isr = null; - BufferedReader br = null; - try { - for (String resource : hmResourcesNormalization.keySet()) { - // read normalization resources with "Temponym" only if temponym tagging is selected - if ( (!(resource.contains("Temponym"))) || - ((load_temponym_resources) && (resource.contains("Temponym")))){ - - Logger.printDetail(component, "Adding normalization resource: "+resource); - // create a buffered reader for every normalization resource file - is = hmResourcesNormalization.getInputStream(resource); - isr = new InputStreamReader(is, "UTF-8"); - br = new BufferedReader(isr); - for ( String line; (line=br.readLine()) != null; ) { - if (line.startsWith("//")) continue; // ignore comments - - // check each line for the normalization format (defined in paReadNormalizations) - boolean correctLine = false; - for (MatchResult r : Toolbox.findMatches(paReadNormalizations, line)) { - correctLine = true; - String resource_word = replaceSpaces(r.group(1)); - String normalized_word = r.group(2); - for (String which : hmAllNormalization.keySet()) { - if (resource.equals(which)) { - hmAllNormalization.get(which).put(resource_word,normalized_word); - } - } - if ((correctLine == false) && (!(line.matches("")))) { - Logger.printError("["+component+"] Cannot read one of the lines of normalization resource "+resource); - Logger.printError("["+component+"] Line: "+line); - } + // check each line for the normalization format (defined in paReadNormalizations) + maReadNormalizations.reset(line); + if (!maReadNormalizations.find()) { + LOG.error("Cannot read one of the lines of normalization resource {}\nLine: {}", resource, line); + continue; + } + String resource_word = maReadNormalizations.group(1); + String normalized_word = maReadNormalizations.group(2); + for (String which : hmAllNormalization.keySet()) { + if (resource.equals(which)) { + hmAllNormalization.get(which).put(resource_word, normalized_word); } } } - else { - Logger.printDetail(component, "No Temponym Tagging selected. Skipping normalization resource: "+resource); - } - } - } catch (IOException e) { - e.printStackTrace(); - } finally { - try { - if(br != null) { - br.close(); - } - if(isr != null) { - isr.close(); - } - if(is != null) { - is.close(); - } - } catch(Exception e) { - e.printStackTrace(); + } catch (IOException e) { + LOG.error(e.getMessage(), e); + System.exit(1); } } } @@ -151,7 +133,6 @@ public void readNormalizationResources(ResourceMap hmResourcesNormalization, Boo * sets a couple of rudimentary normalization parameters */ private void readGlobalNormalizationInformation() { - // MONTH IN QUARTER normMonthInQuarter.put("01","1"); normMonthInQuarter.put("02","1"); @@ -167,7 +148,7 @@ private void readGlobalNormalizationInformation() { normMonthInQuarter.put("12","4"); // MONTH IN SEASON - normMonthInSeason.put("", ""); + normMonthInSeason.put("", ""); // FIXME: why? 
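The month-to-quarter table above could equally be computed arithmetically; a sketch (not the project's code) reproducing the same mapping:

    // Illustrative sketch only.
    public class MonthToQuarterSketch {
        public static void main(String[] args) {
            for (int month = 1; month <= 12; month++) {
                int quarter = (month - 1) / 3 + 1; // 01-03 -> 1, ..., 10-12 -> 4
                System.out.printf("%02d -> Q%d%n", month, quarter);
            }
        }
    }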
normMonthInSeason.put("01","WI"); normMonthInSeason.put("02","WI"); normMonthInSeason.put("03","SP"); @@ -182,34 +163,34 @@ private void readGlobalNormalizationInformation() { normMonthInSeason.put("12","WI"); // DAY IN WEEK - normDayInWeek.put("sunday","1"); - normDayInWeek.put("monday","2"); - normDayInWeek.put("tuesday","3"); - normDayInWeek.put("wednesday","4"); - normDayInWeek.put("thursday","5"); - normDayInWeek.put("friday","6"); - normDayInWeek.put("saturday","7"); - normDayInWeek.put("Sunday","1"); - normDayInWeek.put("Monday","2"); - normDayInWeek.put("Tuesday","3"); - normDayInWeek.put("Wednesday","4"); - normDayInWeek.put("Thursday","5"); - normDayInWeek.put("Friday","6"); - normDayInWeek.put("Saturday","7"); -// normDayInWeek.put("sunday","7"); -// normDayInWeek.put("monday","1"); -// normDayInWeek.put("tuesday","2"); -// normDayInWeek.put("wednesday","3"); -// normDayInWeek.put("thursday","4"); -// normDayInWeek.put("friday","5"); -// normDayInWeek.put("saturday","6"); -// normDayInWeek.put("Sunday","7"); -// normDayInWeek.put("Monday","1"); -// normDayInWeek.put("Tuesday","2"); -// normDayInWeek.put("Wednesday","3"); -// normDayInWeek.put("Thursday","4"); -// normDayInWeek.put("Friday","5"); -// normDayInWeek.put("Saturday","6"); +// normDayInWeek.put("sunday","1"); +// normDayInWeek.put("monday","2"); +// normDayInWeek.put("tuesday","3"); +// normDayInWeek.put("wednesday","4"); +// normDayInWeek.put("thursday","5"); +// normDayInWeek.put("friday","6"); +// normDayInWeek.put("saturday","7"); +// normDayInWeek.put("Sunday","1"); +// normDayInWeek.put("Monday","2"); +// normDayInWeek.put("Tuesday","3"); +// normDayInWeek.put("Wednesday","4"); +// normDayInWeek.put("Thursday","5"); +// normDayInWeek.put("Friday","6"); +// normDayInWeek.put("Saturday","7"); + normDayInWeek.put("sunday","7"); + normDayInWeek.put("monday","1"); + normDayInWeek.put("tuesday","2"); + normDayInWeek.put("wednesday","3"); + normDayInWeek.put("thursday","4"); + normDayInWeek.put("friday","5"); + normDayInWeek.put("saturday","6"); + normDayInWeek.put("Sunday","7"); + normDayInWeek.put("Monday","1"); + normDayInWeek.put("Tuesday","2"); + normDayInWeek.put("Wednesday","3"); + normDayInWeek.put("Thursday","4"); + normDayInWeek.put("Friday","5"); + normDayInWeek.put("Saturday","6"); // NORM MINUTE @@ -285,6 +266,12 @@ private void readGlobalNormalizationInformation() { normNumber.put("59","59"); normNumber.put("60","60"); + normNumbers = new String[61]; + for (int i = 0; i < 10; i++) + normNumbers[i] = "0"+i; + for (int i = 10; i <= 60; i++) + normNumbers[i] = Integer.toString(i); + // NORM MONTH normMonthName.put("january","01"); normMonthName.put("february","02"); @@ -310,6 +297,10 @@ public final String getFromNormNumber(String key) { return normNumber.get(key); } + public final String normNumber(int key) { + return key >= 0 && key <= 60 ? 
normNumbers[key] : null; + } + public final String getFromNormDayInWeek(String key) { return normDayInWeek.get(key); } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RePatternManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RePatternManager.java index c04388d0..a5ea5914 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RePatternManager.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RePatternManager.java @@ -4,30 +4,42 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; -import java.util.LinkedList; +import java.util.List; import java.util.TreeMap; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.unihd.dbs.uima.annotator.heideltime.utilities.RegexpOptimizer; +import de.unihd.dbs.uima.annotator.heideltime.utilities.RegexpOptimizer.OptimizerException; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; /** * - * This class fills the role of a manager of all the RePattern resources. - * It reads the data from a file system and fills up a bunch of HashMaps - * with their information. + * This class fills the role of a manager of all the RePattern resources. It reads the data from a file system and fills up a bunch of HashMaps with their information. + * * @author jannik stroetgen * */ public class RePatternManager extends GenericResourceManager { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(RePatternManager.class); + protected static HashMap instances = new HashMap(); - + // STORE PATTERNS AND NORMALIZATIONS private TreeMap hmAllRePattern; + private HashMap compiled; + /** - * Constructor calls the parent constructor that sets language/resource - * parameters and collects resource repatterns. + * Constructor calls the parent constructor that sets language/resource parameters and collects resource repatterns. + * * @param language * @param load_temponym_resources */ @@ -36,6 +48,7 @@ private RePatternManager(String language, Boolean load_temponym_resources) { super("repattern", language); // initialize the member map of all repatterns hmAllRePattern = new TreeMap(); + compiled = new HashMap(); ////////////////////////////////////////////////////// // READ PATTERN RESOURCES FROM FILES AND STORE THEM // @@ -50,145 +63,134 @@ private RePatternManager(String language, Boolean load_temponym_resources) { /** * singleton producer. + * * @return singleton instance of RePatternManager */ public static RePatternManager getInstance(Language language, Boolean load_temponym_resources) { - if(!instances.containsKey(language.getName())) { + if (!instances.containsKey(language.getName())) { RePatternManager nm = new RePatternManager(language.getResourceFolder(), load_temponym_resources); instances.put(language.getName(), nm); } - + return instances.get(language.getName()); } - - + /** * READ THE REPATTERN FROM THE FILES. The files have to be defined in the HashMap hmResourcesRePattern. 
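The normNumbers array above replaces per-call formatting with a simple lookup; the on-the-fly formatting it avoids would look like this (sample values invented):

    // Illustrative sketch only.
    public class NormNumberSketch {
        public static void main(String[] args) {
            for (int i : new int[] { 0, 7, 42, 60 })
                System.out.println(String.format("%02d", i)); // 00, 07, 42, 60
        }
    }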
- * @param hmResourcesRePattern RePattern resources to be interpreted - * @param load_temponym_resources whether temponym resources are to be read + * + * @param hmResourcesRePattern + * RePattern resources to be interpreted + * @param load_temponym_resources + * whether temponym resources are to be read */ - private void readRePatternResources(ResourceMap hmResourcesRePattern, Boolean load_temponym_resources) { - + private void readRePatternResources(ResourceMap hmResourcesRePattern, boolean load_temponym_resources) { ////////////////////////////////////// // READ REGULAR EXPRESSION PATTERNS // ////////////////////////////////////// - InputStream is = null; - InputStreamReader isr = null; - BufferedReader br = null; - try { - for (String resource : hmResourcesRePattern.keySet()) { - // read pattern resources with "Temponym" only if temponym tagging is selected - if ( (!(resource.contains("Temponym"))) || - ((load_temponym_resources) && (resource.contains("Temponym")))){ - Logger.printDetail(component, "Adding pattern resource: "+resource); - // create a buffered reader for every repattern resource file - is = hmResourcesRePattern.getInputStream(resource); - isr = new InputStreamReader(is, "UTF-8"); - br = new BufferedReader(isr); - LinkedList patterns = new LinkedList(); - for (String line; (line = br.readLine()) != null; ) { - // disregard comments - if (!line.startsWith("//") && !line.equals("")) { - patterns.add(replaceSpaces(line)); - } - } - - - - // sort the repatterns by length in ascending order - Collections.sort(patterns, new Comparator() { - @Override - public int compare(String o1, String o2) { - String o1effective = o1.replaceAll("\\[[^\\]]*\\]", "X") - .replaceAll("\\?", "") - .replaceAll("\\\\.(?:\\{([^\\}])+\\})?", "X$1"); - String o2effective = o2.replaceAll("\\[[^\\]]*\\]", "X") - .replaceAll("\\?", "") - .replaceAll("\\\\.(?:\\{([^\\}])+\\})?", "X$1"); - - if(o1effective.length() < o2effective.length()) - return 1; - else if(o1effective.length() > o2effective.length()) - return -1; - else - return 0; - } - }); - - StringBuilder sb = new StringBuilder(); - String devPattern = ""; - for(String pat : patterns) { - sb.append("|"); - sb.append(pat); - } - devPattern = sb.toString(); - hmAllRePattern.put(resource, devPattern); - } - else { - Logger.printDetail(component, "No Temponym Tagging selected. Skipping pattern resource: "+resource); - } - } - //////////////////////////// - // FINALIZE THE REPATTERN // - //////////////////////////// - for (String which : hmAllRePattern.keySet()) { - if ( (!(which.contains("Temponym"))) || - ((load_temponym_resources) && (which.contains("Temponym")))){ - finalizeRePattern(which, hmAllRePattern.get(which)); - } + for (String resource : hmResourcesRePattern.keySet()) { + // read pattern resources with "Temponym" only if temponym tagging is selected + if (!load_temponym_resources && resource.contains("Temponym")) { + LOG.trace("No Temponym tagging selected. 
Skipping pattern resource: {}", resource); + continue; } - } catch (IOException e) { - e.printStackTrace(); - } finally { - try { - if(br != null) { - br.close(); - } - if(isr != null) { - isr.close(); - } - if(is != null) { - is.close(); - } - } catch(Exception e) { - e.printStackTrace(); + LOG.debug("Adding pattern resource: {}", resource); + // create a buffered reader for every repattern resource file + try (InputStream is = hmResourcesRePattern.getInputStream(resource); // + InputStreamReader isr = new InputStreamReader(is, "UTF-8"); // + BufferedReader br = new BufferedReader(isr)) { + List patterns = new ArrayList(); + for (String line; (line = br.readLine()) != null;) + // disregard comments + if (!line.startsWith("//") && !line.equals("")) + patterns.add(line); + patterns = optimizePatterns(resource, patterns); + hmAllRePattern.put(resource, String.join("|", patterns)); + } catch (IOException e) { + LOG.error(e.getMessage(), e); } } } - + /** - * Pattern containing regular expression is finalized, i.e., created correctly and added to hmAllRePattern. - * @param name key name - * @param rePattern repattern value + * Optimize a set of patterns into a more efficient regexp, because of Java. + * + * @author Erich Schubert + * @param inpatterns + * Input patterns + * @return Optimized regular expression set */ - private void finalizeRePattern(String name, String rePattern) { - // create correct regular expression - rePattern = rePattern.replaceFirst("\\|", ""); - /* this was added to reduce the danger of getting unusable groups from user-made repattern - * files with group-producing parentheses (i.e. "(foo|bar)" while matching against the documents. */ - rePattern = rePattern.replaceAll("\\(([^\\?])", "(?:$1"); - rePattern = "(" + rePattern + ")"; - rePattern = rePattern.replaceAll("\\\\", "\\\\\\\\"); - // add rePattern to hmAllRePattern - hmAllRePattern.put(name, rePattern); + public static List optimizePatterns(CharSequence name, List inpatterns) { + // Since we already have some rules written as res, + // We try to expand some basic constructs first. + try { + ArrayList expanded = new ArrayList<>(); + for (String s : inpatterns) { + try { + RegexpOptimizer.expandPatterns(s, x -> expanded.add(x.toString())); + } catch (OptimizerException e) { + // More specific message than below. 
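RegexpOptimizer is a project utility that is not part of this excerpt. As a rough stand-in for what combinePatterns achieves, here is a deliberately simplified sketch that handles only literal words: quote each alternative, sort longest-first so longer variants win over their prefixes, and join everything into one non-capturing alternation. The real optimizer additionally expands regexp constructs before combining:

    // Simplified stand-in, not the project's RegexpOptimizer.
    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.List;
    import java.util.regex.Pattern;
    import java.util.stream.Collectors;

    public class CombineSketch {
        static String combineLiterals(List<String> words) {
            return words.stream()
                    .sorted(Comparator.comparingInt(String::length).reversed())
                    .map(Pattern::quote)
                    .collect(Collectors.joining("|", "(?:", ")"));
        }

        public static void main(String[] args) {
            System.out.println(combineLiterals(Arrays.asList("Jan", "Januar", "January")));
            // -> (?:\QJanuary\E|\QJanuar\E|\QJan\E)
        }
    }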
+ LOG.warn("Pattern '{}' for '{}' contains a too complex regexp construct, cannot optimize: {}", s, name, e.getMessage()); + return inpatterns; + } + } + if (expanded.isEmpty()) { + LOG.info("Regexp pattern {} is empty.", name); + return Collections.emptyList(); + } + String pattern = RegexpOptimizer.combinePatterns(expanded); + LOG.trace("Combined {} into: {}", name, pattern); + return Arrays.asList(pattern); + } catch (OptimizerException e) { + LOG.warn("Pattern '{}' contains a too complex regexp construct, cannot optimize: {}", name, e.getMessage()); + return inpatterns; + } } - + /** * proxy method to access the hmAllRePattern member - * @param key key to check for + * + * @param key + * key to check for * @return whether the map contains the key */ - public Boolean containsKey(String key) { + public boolean containsKey(String key) { return hmAllRePattern.containsKey(key); } /** * proxy method to access the hmAllRePattern member - * @param key Key to retrieve data from + * + * @param key + * Key to retrieve data from * @return String from the map */ public String get(String key) { return hmAllRePattern.get(key); } + /** + * proxy method to access the compiled hmAllRePattern member + * + * @param key + * Key to retrieve data from + * @return String from the map + */ + public Pattern getCompiled(String key) { + Pattern p = compiled.get(key); + if (p != null) + return p; + String rePattern = hmAllRePattern.get(key); + try { + Pattern c = Pattern.compile(rePattern); + int groupcount = c.matcher("").groupCount(); + if (groupcount != 0) + LOG.error("rePattern {} contains unexpected groups: {}\nPattern: {}", key, groupcount - 1, rePattern); + compiled.put(key, c); + return c; + } catch (PatternSyntaxException e) { + LOG.error("Failed to compile RePattern {}:\n{}", key, rePattern); + throw e; + } + } + } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RegexHashMap.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RegexHashMap.java index 8ffe5987..1a11e795 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RegexHashMap.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RegexHashMap.java @@ -3,7 +3,6 @@ import java.util.Collection; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; @@ -12,12 +11,10 @@ * Implements a HashMap extended with regular expression keys and caching functionality. * * @author Julian Zell - * */ public class RegexHashMap implements Map { - - private HashMap container = new HashMap(); - private HashMap cache = new HashMap(); + private HashMap container = new HashMap<>(); + private HashMap cache = new HashMap<>(); /** * clears both the container and the cache hashmaps @@ -32,6 +29,8 @@ public void clear() { * container's keys as regexes and checks whether they match the specific key. 
*/ public boolean containsKey(Object key) { + if (!(key instanceof String)) + return false; // the key is a direct hit from our cache if(cache.containsKey(key)) return true; @@ -39,12 +38,11 @@ public boolean containsKey(Object key) { if(container.containsKey(key)) return true; + String str = (String) key; // check if the requested key is a matching string of a regex key from our container - Iterator regexKeys = container.keySet().iterator(); - while(regexKeys.hasNext()) { - if(Pattern.matches(regexKeys.next(), (String) key)) + for(String regexKey : container.keySet()) + if(Pattern.matches(regexKey, str)) return true; - } // if the three previous tests yield no result, the key does not exist return false; @@ -70,7 +68,7 @@ public boolean containsValue(Object value) { */ public Set> entrySet() { // prepare the container - HashSet> set = new HashSet>(); + HashSet> set = new HashSet<>(); // add the set from our container set.addAll(container.entrySet()); // add the set from our cache @@ -88,26 +86,24 @@ public Set> entrySet() { */ public T get(Object key) { // output for requested key null is the value null; normal Map behavior - if(key == null) return null; - + if(!(key instanceof String)) return null; + T result = null; - if((result = cache.get(key)) != null) { - // if the requested key maps to a value in the cache + // if the requested key maps to a value in the cache + if((result = cache.get(key)) != null) return result; - } else if((result = container.get(key)) != null) { - // if the requested key maps to a value in the container + + // if the requested key maps to a value in the container + if((result = container.get(key)) != null) return result; - } else { - // check if the requested key is a matching string of a regex key from our container - Iterator> regexKeys = container.entrySet().iterator(); - while(regexKeys.hasNext()) { - // prepare current entry - Entry entry = regexKeys.next(); - // check if the key is a regex matching the input key - if(Pattern.matches(entry.getKey(), (String) key)) { - putCache((String) key, entry.getValue()); - return entry.getValue(); - } + + // check if the requested key is a matching string of a regex key from our container + String str = (String) key; + for (Entry entry : container.entrySet()) { + // check if the key is a regex matching the input key + if(Pattern.matches(entry.getKey(), str)) { + putCache(str, entry.getValue()); + return entry.getValue(); } } @@ -127,7 +123,7 @@ public boolean isEmpty() { */ public Set keySet() { // prepare container - HashSet set = new HashSet(); + HashSet set = new HashSet<>(); // add container keys set.addAll(container.keySet()); // add cache keys diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceMap.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceMap.java index 660f8ef8..19e86108 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceMap.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceMap.java @@ -11,7 +11,7 @@ import java.util.Set; import java.util.TreeSet; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; +import org.slf4j.LoggerFactory; public class ResourceMap implements Map { HashMap outerFiles = new HashMap(); @@ -52,7 +52,7 @@ public InputStream getInputStream(String key) { try { is = new FileInputStream(outerFiles.get(key)); } catch(FileNotFoundException e) { - Logger.printError("File " + key + " disppeared while loading resources."); + LoggerFactory.getLogger(ResourceMap.class).error("File " + key + " 
disappeared while loading resources."); } } else if(innerFiles.containsKey(key)) { is = this.getClass().getClassLoader().getResourceAsStream(innerFiles.get(key)); diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceScanner.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceScanner.java index a71bcb3b..5b31e1d1 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceScanner.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceScanner.java @@ -15,9 +15,13 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ResourceScanner { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(ResourceScanner.class); + private static ResourceScanner INSTANCE = null; /** @@ -100,12 +104,10 @@ private ResourceScanner() { public static void main(String[] args) { - @SuppressWarnings("unused") - ResourceScanner rs = null; try { - rs = new ResourceScanner(); + new ResourceScanner(); } catch (Exception e) { - e.printStackTrace(); + LOG.error(e.getMessage(), e); } } @@ -125,7 +127,7 @@ private void scanValidInsideResourcesFolder(HashMap jarContent Pattern rulePattern = Pattern.compile(language + "/rules/resources_rules_(.+)\\.txt$"); if (entry.getValue().isDirectory()) { - Logger.printDetail(ResourceScanner.class, "Testing " + entry.getKey()); + LOG.trace("Testing {}", entry.getKey()); /* * our conditions for something being a resources folder: the resource * folder must contain at least the following folders: @@ -136,9 +138,9 @@ private void scanValidInsideResourcesFolder(HashMap jarContent * |- rules */ - Boolean repatternExists = false; - Boolean normalizationExists = false; - Boolean ruleExists = false; + boolean repatternExists = false; + boolean normalizationExists = false; + boolean ruleExists = false; for(String entryName : jarContents.keySet()) { if(!repatternExists && repatternPattern.matcher(entryName).matches()) { @@ -153,11 +155,11 @@ private void scanValidInsideResourcesFolder(HashMap jarContent } if(!repatternExists || !normalizationExists || !ruleExists) { - Logger.printDetail(ResourceScanner.class, "We need at least one readable resource file of each type to run."); + LOG.debug("We need at least one readable resource file of each type to run (in {})", entry.getKey()); continue; } - Logger.printDetail(ResourceScanner.class, "Valid resource folder."); + LOG.trace("Valid resource folder: {}", entry.getKey()); // at this point, the folder is obviously a language resource folder => collect streams this.repatterns.put(language, new ResourceMap()); @@ -191,10 +193,10 @@ private void scanValidOutsideResourcesFolder(File resourcePath) { for (File supposedLanguagePath : pathContents) { String language = supposedLanguagePath.getName(); if (supposedLanguagePath.isDirectory()) { - Logger.printDetail(ResourceScanner.class, "Testing " + supposedLanguagePath.getAbsolutePath()); + LOG.trace("Testing {}", supposedLanguagePath); if (!supposedLanguagePath.exists()) { - Logger.printDetail(ResourceScanner.class, "This path doesn't exist."); + LOG.debug("This path doesn't exist."); continue; } @@ -215,8 +217,7 @@ private void scanValidOutsideResourcesFolder(File resourcePath) { if (!repatternFolder.exists() || !repatternFolder.canRead() || !repatternFolder.isDirectory() || !normalizationFolder.exists() || !normalizationFolder.canRead() || 
!normalizationFolder.isDirectory() || !ruleFolder.exists() || !ruleFolder.canRead() || !ruleFolder.isDirectory()) { - Logger.printDetail(ResourceScanner.class, "We need at least the folders repattern, normalization and rules in this folder."); - + LOG.debug("We need at least the folders repattern, normalization and rules in folder '{}'.", supposedLanguagePath); continue; } @@ -244,11 +245,11 @@ public boolean accept(File arg0, String arg1) { || !repatternFiles[0].exists() || !repatternFiles[0].canRead() || !repatternFiles[0].isFile() || !normalizationFiles[0].exists() || !normalizationFiles[0].canRead() || !normalizationFiles[0].isFile() || !ruleFiles[0].exists() || !ruleFiles[0].canRead() || !ruleFiles[0].isFile()) { - Logger.printDetail(ResourceScanner.class, "We need at least one readable resource file of each type to run."); + LOG.debug("We need at least one readable resource file of each type to run in '{}'", supposedLanguagePath); continue; } - Logger.printDetail(ResourceScanner.class, "Valid resource folder."); + LOG.trace("Valid resource folder: {}", supposedLanguagePath); // at this point, the folder is obviously a language resource folder => collect streams this.repatterns.put(language, new ResourceMap()); diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/Rule.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/Rule.java new file mode 100644 index 00000000..616a3a23 --- /dev/null +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/Rule.java @@ -0,0 +1,98 @@ +package de.unihd.dbs.uima.annotator.heideltime.resources; + +import java.util.regex.Pattern; + +/** + * Class representing a single rule. + * + * @author Erich Schubert + */ +public class Rule implements Comparable { + /** + * Constructor with mandatory parameters. 
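+	 * <p>
+	 * Illustrative construction only -- the rule below is invented, not taken from any shipped resource file:
+	 * <pre>{@code
+	 * Rule r = new Rule("date_iso_sketch",
+	 *     Pattern.compile("([0-9]{4})-([0-9]{2})-([0-9]{2})"),
+	 *     "group(1)-group(2)-group(3)");
+	 * }</pre>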
+ * + * @param name Name + * @param pattern Pattern + * @param normalization Normalization + */ + public Rule(String name, Pattern pattern, String normalization) { + this.name = name; + this.pattern = pattern; + this.normalization = normalization; + } + + /** Rule name */ + String name; + + /** Extraction pattern */ + Pattern pattern; + + /** Normalization */ + String normalization; + + /** Offset pattern*/ + String offset; + + /** Quant */ + String quant; + + /** Freq */ + String freq; + + /** Mod */ + String mod; + + /** Position constraint */ + String posConstratint; + + /** Empty value */ + String emptyValue; + + /** Fast check */ + Pattern fastCheck; + + public String getName() { + return name; + } + + public Pattern getPattern() { + return pattern; + } + + public String getNormalization() { + return normalization; + } + + public String getOffset() { + return offset; + } + + public String getQuant() { + return quant; + } + + public String getFreq() { + return freq; + } + + public String getMod() { + return mod; + } + + public String getPosConstratint() { + return posConstratint; + } + + public String getEmptyValue() { + return emptyValue; + } + + public Pattern getFastCheck() { + return fastCheck; + } + + @Override + public int compareTo(Rule other) { + return name.compareTo(other.name); + } +} diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleExpansion.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleExpansion.java new file mode 100644 index 00000000..12988302 --- /dev/null +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleExpansion.java @@ -0,0 +1,212 @@ +package de.unihd.dbs.uima.annotator.heideltime.resources; + +import java.util.regex.MatchResult; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import de.unihd.dbs.uima.annotator.heideltime.utilities.ChineseNumbers; + +/** + * HeidelTime rule expansion logic. + * + * There is some copy and paste involved in the {@code expandX} functions, but this allows the hotspot VM to optimize them independently. + * + * This should probably be integrated into the {@link Rule} class, and only some expansions are necessary. 
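+ * <p>
+ * Minimal calling sketch (the surrounding {@code rule}, {@code matchResult},
+ * {@code norm} and {@code language} objects are assumed to exist in the caller):
+ * <pre>{@code
+ * String value = RuleExpansion.applyRuleFunctions(rule.getName(),
+ *     rule.getNormalization(), matchResult, norm, language);
+ * }</pre>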
+ * + * @author Erich Schubert + */ +public class RuleExpansion { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(RuleExpansion.class); + + static Pattern paNorm = Pattern.compile("%([A-Za-z0-9]+?)\\(group\\(([0-9]+)\\)\\)"); + static Pattern paGroup = Pattern.compile("group\\(([0-9]+)\\)"); + static Pattern paSubstring = Pattern.compile("%SUBSTRING%\\((.*?),([0-9]+),([0-9]+)\\)"); + static Pattern paLowercase = Pattern.compile("%LOWERCASE%\\((.*?)\\)"); + static Pattern paUppercase = Pattern.compile("%UPPERCASE%\\((.*?)\\)"); + static Pattern paSum = Pattern.compile("%SUM%\\((.*?),(.*?)\\)"); + static Pattern paNormNoGroup = Pattern.compile("%([A-Za-z0-9]+?)\\((.*?)\\)"); + static Pattern paChineseNorm = Pattern.compile("%CHINESENUMBERS%\\((.*?)\\)"); + static Pattern WHITESPACE_NORM = Pattern.compile("[\n\\s]+"); + + public static String applyRuleFunctions(String rule, String pattern, MatchResult m, NormalizationManager norm, Language language) { + StringBuilder tonormalize = new StringBuilder(pattern); + // pattern for normalization functions + group information + // pattern for group information + Matcher mr = paNorm.matcher(tonormalize); + while (tonormalize.indexOf("%") >= 0 || tonormalize.indexOf("group") >= 0) { + // replace normalization functions + expandNormalizationGroup(tonormalize, mr, norm, m, rule); + // replace other groups + expandGroups(tonormalize, mr, m, rule); + // apply the substring function + expandSubstringFunction(tonormalize, mr, m, rule); + if (language.useLowercase()) { + expandLowerCaseFunction(tonormalize, mr); + expandUpperCaseFunction(tonormalize, mr); + } + // replace sum, concatenation + expandSumFunction(tonormalize, mr, m, rule); + // replace normalization function without group + expandNormalizationFull(tonormalize, mr, norm, rule); + // replace Chinese with Arabic numerals + replaceChineseNumerals(tonormalize, mr); + } + return tonormalize.toString(); + } + + private static void expandNormalizationGroup(StringBuilder tonormalize, Matcher mr, NormalizationManager norm, MatchResult m, String rule) { + mr.usePattern(paNorm).reset(tonormalize); + for (int pos = 0; mr.find(pos);) { + String normfunc = mr.group(1); + int start = mr.start(), end = mr.end(); + int groupid = Integer.parseInt(mr.group(2)); + if (LOG.isTraceEnabled()) { + LOG.trace("rule:" + rule); + LOG.trace("tonormalize:" + tonormalize.toString()); + LOG.trace("x.group():" + mr.group()); + LOG.trace("x.group(1):" + normfunc); + LOG.trace("x.group(2):" + mr.group(2)); + LOG.trace("m.group():" + m.group()); + LOG.trace("m.group(" + groupid + "):" + m.group(groupid)); + LOG.trace("hmR...:" + norm.getFromHmAllNormalization(normfunc).get(m.group(groupid))); + } + + if (groupid > m.groupCount()) { + LOG.error("Invalid group reference '{}' in normalization pattern of rule: {}", groupid, rule); + tonormalize.delete(start, end); + continue; + } + String value = m.group(groupid); + if (value == null) { + // This is not unusual to happen + LOG.debug("Empty part to normalize in {}, rule {}, '{}'", normfunc, rule, m.group()); + tonormalize.delete(start, end); + continue; + } + value = WHITESPACE_NORM.matcher(value).replaceAll(" "); + RegexHashMap normmap = norm.getFromHmAllNormalization(normfunc); + String rep = normmap != null ? normmap.get(value) : null; + if (rep == null) { + if (normfunc.contains("Temponym")) { + LOG.debug("Temponym '{}' normalization problem. 
Value: {} in " + // + "rule: {} tonormalize: {}", normfunc, value, rule, tonormalize); + tonormalize.delete(start, end); + continue; + } + LOG.warn("'{}' normalization problem. Value: {} in " + // + "rule: {} tonormalize: {}", normfunc, value, rule, tonormalize); + tonormalize.delete(start, end); + continue; + } + tonormalize.replace(start, end, rep); + pos = start + rep.length(); + } + } + + private static void expandGroups(StringBuilder tonormalize, Matcher mr, MatchResult m, String rule) { + mr.usePattern(paGroup).reset(tonormalize); + for (int pos = 0; mr.find(pos);) { + int groupid = Integer.parseInt(mr.group(1)); + int start = mr.start(), end = mr.end(); + if (groupid > m.groupCount()) { + LOG.error("Invalid group reference '{}' in normalization pattern of rule: {}", groupid, rule); + tonormalize.delete(start, end); + continue; + } + if (LOG.isTraceEnabled()) { + LOG.trace("tonormalize:" + tonormalize); + LOG.trace("x.group():" + mr.group()); + LOG.trace("x.group(1):" + mr.group(1)); + LOG.trace("m.group():" + mr.group()); + LOG.trace("m.group(" + mr.group(1) + "):" + m.group(groupid)); + } + String rep = m.group(groupid); + tonormalize.replace(start, end, rep); + pos = start + rep.length(); + } + } + + private static void expandNormalizationFull(StringBuilder tonormalize, Matcher mr, NormalizationManager norm, String rule) { + mr.usePattern(paNormNoGroup).reset(tonormalize); + int pos = 0; + while (mr.find(pos)) { + String normfunc = mr.group(1); + String value = mr.group(2); + String rep = norm.getFromHmAllNormalization(normfunc).get(value); + if (rep == null) { + LOG.warn("'{}' normalization problem. Value: {} in " + // + "rule: {} tonormalize: {}", normfunc, value, rule, tonormalize); + rep = ""; + } + int start = mr.start(), end = mr.end(); + tonormalize.replace(start, end, rep); + pos = start + rep.length(); + } + } + + private static void expandSubstringFunction(StringBuilder tonormalize, Matcher mr, MatchResult m, String rule) { + mr.usePattern(paSubstring).reset(tonormalize); + for (int pos = 0; mr.find(pos);) { + int start = mr.start(), end = mr.end(); + try { + String rep = mr.group(1).substring(Integer.parseInt(mr.group(2)), Integer.parseInt(mr.group(3))); + tonormalize.replace(start, end, rep); + pos = start + rep.length(); + } catch (StringIndexOutOfBoundsException e) { + LOG.error("Substring out of bounds: '{}' for '{}' with rule '{}'", mr.group(), m.group(), rule, e); + tonormalize.delete(start, end); + } + } + } + + private static void expandLowerCaseFunction(StringBuilder tonormalize, Matcher mr) { + mr.usePattern(paLowercase).reset(tonormalize); + for (int pos = 0; mr.find(pos);) { + String rep = mr.group(1).toLowerCase(); + int start = mr.start(), end = mr.end(); + tonormalize.replace(start, end, rep); + pos = start + rep.length(); + } + } + + private static void expandUpperCaseFunction(StringBuilder tonormalize, Matcher mr) { + mr.usePattern(paUppercase).reset(tonormalize); + for (int pos = 0; mr.find(pos);) { + String rep = mr.group(1).toUpperCase(); + int start = mr.start(), end = mr.end(); + tonormalize.replace(start, end, rep); + pos = start + rep.length(); + } + } + + private static void expandSumFunction(StringBuilder tonormalize, Matcher mr, MatchResult m, String rule) { + mr.usePattern(paSum).reset(tonormalize); + for (int pos = 0; mr.find(pos);) { + int start = mr.start(), end = mr.end(); + try { + String rep = Integer.toString(Integer.parseInt(mr.group(1)) + Integer.parseInt(mr.group(2))); + tonormalize.replace(start, end, rep); + pos = start + 
rep.length(); + } catch (NumberFormatException e) { + LOG.error("Failed to expand sum: '{}' for '{}' with rule '{}'", mr.group(), m.group(), rule, e); + tonormalize.delete(start, end); + } + } + } + + private static void replaceChineseNumerals(StringBuilder tonormalize, Matcher mr) { + mr.usePattern(paChineseNorm).reset(tonormalize); + for (int pos = 0; mr.find(pos);) { + String rep = ChineseNumbers.normalize(mr.group(1)); + if (rep == null) // TODO: Add a warning + continue; + int start = mr.start(), end = mr.end(); + tonormalize.replace(start, end, rep); + pos = start + rep.length(); + } + } +} \ No newline at end of file diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleManager.java index 89f96d28..0e34b48a 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleManager.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleManager.java @@ -4,106 +4,77 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.util.Collections; -import java.util.Comparator; +import java.util.ArrayList; import java.util.HashMap; -import java.util.LinkedList; -import java.util.regex.MatchResult; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Matcher; import java.util.regex.Pattern; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** - * - * This class fills the role of a manager of all the rule resources. It reads - * the data from a file system and fills up a bunch of HashMaps with their - * information. + * This class fills the role of a manager of all the rule resources. It reads the data from a file system and fills up a bunch of HashMaps with their information. * * @author jannik stroetgen - * */ public class RuleManager extends GenericResourceManager { - protected static HashMap instances = new HashMap(); + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(RuleManager.class); - // PATTERNS TO READ RESOURCES "RULES" AND "NORMALIZATION" - Pattern paReadRules = Pattern.compile("RULENAME=\"(.*?)\",EXTRACTION=\"(.*?)\",NORM_VALUE=\"(.*?)\"(.*)"); - - // EXTRACTION PARTS OF RULES (patterns loaded from files) - HashMap hmDatePattern = new HashMap(); - HashMap hmDurationPattern = new HashMap(); - HashMap hmTimePattern = new HashMap(); - HashMap hmSetPattern = new HashMap(); - - // NORMALIZATION PARTS OF RULES (patterns loaded from files) - HashMap hmDateNormalization = new HashMap(); - HashMap hmTimeNormalization = new HashMap(); - HashMap hmDurationNormalization = new HashMap(); - HashMap hmSetNormalization = new HashMap(); + /** Static pool */ + protected static HashMap instances = new HashMap(); - // OFFSET PARTS OF RULES (patterns loaded from files) - HashMap hmDateOffset = new HashMap(); - HashMap hmTimeOffset = new HashMap(); - HashMap hmDurationOffset = new HashMap(); - HashMap hmSetOffset = new HashMap(); + /** + * singleton producer. 
+ * + * @return singleton instance of RuleManager + */ + public static RuleManager getInstance(Language language, boolean load_temponym_resources) { + RuleManager rm = instances.get(language.getName()); + if (rm != null) + return rm; + synchronized (RuleManager.class) { + rm = instances.get(language.getName()); + if (rm != null) + return rm; + rm = new RuleManager(language.getResourceFolder(), load_temponym_resources); + instances.put(language.getName(), rm); + return rm; + } + } - // QUANT PARTS OF RULES (patterns loaded from files) - HashMap hmDateQuant = new HashMap(); - HashMap hmTimeQuant = new HashMap(); - HashMap hmDurationQuant = new HashMap(); - HashMap hmSetQuant = new HashMap(); + /** + * Exception thrown when a pattern could not be built. + */ + public static class InvalidPatternException extends RuntimeException { + private static final long serialVersionUID = 1L; - // FREQ PARTS OF RULES (patterns loaded from files) - HashMap hmDateFreq = new HashMap(); - HashMap hmTimeFreq = new HashMap(); - HashMap hmDurationFreq = new HashMap(); - HashMap hmSetFreq = new HashMap(); + public InvalidPatternException(String msg) { + super(msg); + } - // MOD PARTS OF RULES (patterns loaded from files) - HashMap hmDateMod = new HashMap(); - HashMap hmTimeMod = new HashMap(); - HashMap hmDurationMod = new HashMap(); - HashMap hmSetMod = new HashMap(); + public InvalidPatternException(String msg, Throwable cause) { + super(msg, cause); + } + } - // POS PARTS OF RULES (patterns loaded from files) - HashMap hmDatePosConstraint = new HashMap(); - HashMap hmTimePosConstraint = new HashMap(); - HashMap hmDurationPosConstraint = new HashMap(); - HashMap hmSetPosConstraint = new HashMap(); - - // EMPTYVALUE part of rules - HashMap hmDateEmptyValue = new HashMap(); - HashMap hmTimeEmptyValue = new HashMap(); - HashMap hmDurationEmptyValue = new HashMap(); - HashMap hmSetEmptyValue = new HashMap(); - - // FASTCHECK part of rules - HashMap hmDateFastCheck = new HashMap(); - HashMap hmTimeFastCheck = new HashMap(); - HashMap hmDurationFastCheck = new HashMap(); - HashMap hmSetFastCheck = new HashMap(); + List hmDateRules = new ArrayList<>(); + List hmDurationRules = new ArrayList<>(); + List hmTimeRules = new ArrayList<>(); + List hmSetRules = new ArrayList<>(); + List hmTemponymRules = new ArrayList<>(); - // TEMPONYM RULES (loaded from resource files) - HashMap hmTemponymPattern = new HashMap(); - HashMap hmTemponymNormalization = new HashMap(); - HashMap hmTemponymOffset = new HashMap(); - HashMap hmTemponymQuant = new HashMap(); - HashMap hmTemponymFreq = new HashMap(); - HashMap hmTemponymMod = new HashMap(); - HashMap hmTemponymPosConstraint = new HashMap(); - HashMap hmTemponymEmptyValue = new HashMap(); - HashMap hmTemponymFastCheck = new HashMap(); - /** - * Constructor calls the parent constructor that sets language/resource - * parameters and collects rules resources. + * Constructor calls the parent constructor that sets language/resource parameters and collects rules resources. 
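+ * <p>
+ * The collected rule files consist of lines in the format parsed by
+ * {@code readRules} below; an invented, abbreviated example:
+ * <pre>{@code
+ * RULENAME="date_sketch",EXTRACTION="%reMonthName %reDayNumber",NORM_VALUE="group(1)-group(2)"
+ * }</pre>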
* * @param language - * language of resources to be used + * language of resources to be used * @param load_temponym_resources - * whether temponym resources are loaded + * whether temponym resources are loaded */ - protected RuleManager(String language, Boolean load_temponym_resources) { + protected RuleManager(String language, boolean load_temponym_resources) { // Process Generic constructor with rules parameter super("rules", language); @@ -116,622 +87,203 @@ protected RuleManager(String language, Boolean load_temponym_resources) { } /** - * singleton producer. - * - * @return singleton instance of RuleManager - */ - public static RuleManager getInstance(Language language, Boolean load_temponym_resources) { - if(!instances.containsKey(language.getName())) { - RuleManager nm = new RuleManager(language.getResourceFolder(), load_temponym_resources); - instances.put(language.getName(), nm); - } - - return instances.get(language.getName()); - } - - /** - * READ THE RULES FROM THE FILES. The files have to be defined in the - * HashMap hmResourcesRules. + * READ THE RULES FROM THE FILES. The files have to be defined in the HashMap hmResourcesRules. * * @param hmResourcesRules - * rules to be interpreted + * rules to be interpreted * @param load_temponym_resources - * whether temponym resources are loaded + * whether temponym resources are loaded */ - public void readRules(ResourceMap hmResourcesRules, String language, Boolean load_temponym_resources) { - InputStream is = null; - InputStreamReader isr = null; - BufferedReader br = null; - - LinkedList resourceKeys = new LinkedList(hmResourcesRules.keySet()); - - // sort DATE > TIME > DURATION > SET > rest - Collections.sort(resourceKeys, new Comparator() { - @Override - public int compare(String arg0, String arg1) { - if("daterules".equals(arg0)) { - return -1; - } else if("timerules".equals(arg0) && !"daterules".equals(arg1)) { - return -1; - } else if("durationrules".equals(arg0) && !"daterules".equals(arg1) && !"timerules".equals(arg1)) { - return -1; - } else if("setrules".equals(arg0) && !"daterules".equals(arg1) && !"timerules".equals(arg1) && !"durationrules".equals(arg1)) { - return -1; + public void readRules(ResourceMap hmResourcesRules, String language, boolean load_temponym_resources) { + RePatternManager rpm = RePatternManager.getInstance(Language.getLanguageFromString(language), load_temponym_resources); + // PATTERNS TO READ RESOURCES "RULES" AND "NORMALIZATION" + Matcher maReadRules = Pattern.compile("^RULENAME=\"(.*?)\",EXTRACTION=\"(.*?)\",NORM_VALUE=\"(.*?)\"(.*)").matcher(""); + + Matcher maAdditional = Pattern.compile("(?<=,)(OFFSET|NORM_QUANT|NORM_FREQ|NORM_MOD|POS_CONSTRAINT|EMPTY_VALUE|FAST_CHECK)=\"(.*?)\" *(?=,|$)").matcher(""); + + for (String resource : hmResourcesRules.keySet()) { + try (InputStream is = hmResourcesRules.getInputStream(resource); // + InputStreamReader isr = new InputStreamReader(is, "UTF-8"); // + BufferedReader br = new BufferedReader(isr)) { + List rules; + switch (resource) { + case "daterules": + rules = hmDateRules; + break; + case "durationrules": + rules = hmDurationRules; + break; + case "setrules": + rules = hmSetRules; + break; + case "timerules": + rules = hmTimeRules; + break; + case "temponymrules": + rules = hmTemponymRules; + break; + case "intervalrules": // Handled separately. 
+ continue; + default: + LOG.debug("Resource type not recognized by HeidelTime: {}", resource); + continue; } - return 1; - } - }); - - try { - for (String resource : resourceKeys) { - is = hmResourcesRules.getInputStream(resource); - isr = new InputStreamReader(is, "UTF-8"); - br = new BufferedReader(isr); - - Logger.printDetail(component, "Adding rule resource: " + resource); - for (String line; (line = br.readLine()) != null;) { + + LOG.debug("Adding rule resource: {}", resource); + lines: for (String line; (line = br.readLine()) != null;) { // skip comments or empty lines in resource files if (line.startsWith("//") || line.equals("")) continue; - - boolean correctLine = false; - Logger.printDetail("DEBUGGING: reading rules..." + line); + + LOG.trace("Reading rules... {}", line); // check each line for the name, extraction, and // normalization part, others are optional - for (MatchResult r : Toolbox.findMatches(paReadRules, line)) { - correctLine = true; - String rule_name = r.group(1); - String rule_extraction = replaceSpaces(r.group(2)); - String rule_normalization = r.group(3); - String rule_offset = ""; - String rule_quant = ""; - String rule_freq = ""; - String rule_mod = ""; - String pos_constraint = ""; - String rule_empty_value = ""; - String rule_fast_check = ""; - - // throw an error if the rule's name already exists - if(hmDatePattern.containsValue(rule_name) || - hmDurationPattern.containsValue(rule_name) || - hmSetPattern.containsValue(rule_name) || - hmTimePattern.containsValue(rule_name)) { - Logger.printError("WARNING: Duplicate rule name detected. This rule is being ignored:"); - Logger.printError(line); - } - - // ////////////////////////////////////////////////////////////////// - // RULE EXTRACTION PARTS ARE TRANSLATED INTO REGULAR - // EXPRESSSIONS // - // ////////////////////////////////////////////////////////////////// - // create pattern for rule extraction part - Pattern paVariable = Pattern.compile("%(re[a-zA-Z0-9]*)"); - RePatternManager rpm = RePatternManager.getInstance(Language.getLanguageFromString(language), load_temponym_resources); - for (MatchResult mr : Toolbox.findMatches(paVariable, rule_extraction)) { - Logger.printDetail("DEBUGGING: replacing patterns..." + mr.group()); - if (!(rpm.containsKey(mr.group(1)))) { - Logger.printError("Error creating rule:" + rule_name); - Logger.printError("The following pattern used in this rule does not exist, does it? %" + mr.group(1)); - System.exit(-1); - } - rule_extraction = rule_extraction.replaceAll("%" + mr.group(1), rpm.get(mr.group(1))); - } - rule_extraction = rule_extraction.replaceAll(" ", "[\\\\s]+"); - Pattern pattern = null; - try { - pattern = Pattern.compile(rule_extraction); - } catch (java.util.regex.PatternSyntaxException e) { - Logger.printError("Compiling rules resulted in errors."); - Logger.printError("Problematic rule is " + rule_name); - Logger.printError("Cannot compile pattern: " + rule_extraction); - e.printStackTrace(); - System.exit(-1); + maReadRules.reset(line); + if (!maReadRules.find()) { + LOG.error("Cannot read the following line of rule resource {} line: {}", resource, line); + continue lines; + } + String rule_name = maReadRules.group(1); + String rule_extraction = maReadRules.group(2); + String rule_normalization = maReadRules.group(3); + + // throw an error if the rule's name already exists + for (Rule existing : rules) + if (existing.name.equals(rule_name)) { + LOG.warn("WARNING: Duplicate rule name detected. 
This rule is being ignored: {}", line); + continue lines; } - // Pattern pattern = Pattern.compile(rule_extraction); - // /////////////////////////////////// - // CHECK FOR ADDITIONAL CONSTRAINS // - // /////////////////////////////////// - Pattern patternFast = null; - if (!(r.group(4) == null)) { - if (r.group(4).contains("OFFSET")) { - Pattern paOffset = Pattern - .compile("OFFSET=\"(.*?)\""); - for (MatchResult ro : Toolbox.findMatches( - paOffset, line)) { - rule_offset = ro.group(1); - } - } - if (r.group(4).contains("NORM_QUANT")) { - Pattern paQuant = Pattern - .compile("NORM_QUANT=\"(.*?)\""); - for (MatchResult rq : Toolbox.findMatches( - paQuant, line)) { - rule_quant = rq.group(1); - } - } - if (r.group(4).contains("NORM_FREQ")) { - Pattern paFreq = Pattern - .compile("NORM_FREQ=\"(.*?)\""); - for (MatchResult rf : Toolbox.findMatches( - paFreq, line)) { - rule_freq = rf.group(1); - } - } - if (r.group(4).contains("NORM_MOD")) { - Pattern paMod = Pattern - .compile("NORM_MOD=\"(.*?)\""); - for (MatchResult rf : Toolbox.findMatches( - paMod, line)) { - rule_mod = rf.group(1); - } - } - if (r.group(4).contains("POS_CONSTRAINT")) { - Pattern paPos = Pattern - .compile("POS_CONSTRAINT=\"(.*?)\""); - for (MatchResult rp : Toolbox.findMatches( - paPos, line)) { - pos_constraint = rp.group(1); - } - } - if (r.group(4).contains("EMPTY_VALUE")) { - Pattern paEmpty = Pattern - .compile("EMPTY_VALUE=\"(.*?)\""); - for (MatchResult rp : Toolbox.findMatches( - paEmpty, line)) { - rule_empty_value = rp.group(1); - } - } - if (r.group(4).contains("FAST_CHECK")) { - Pattern paFast = Pattern - .compile("FAST_CHECK=\"(.*?)\""); - for (MatchResult rp : Toolbox.findMatches( - paFast, line)) { - rule_fast_check = rp.group(1); - // create pattern for rule fast check part -- similar to extraction part - // thus using paVariable and rpm - for (MatchResult mr : Toolbox.findMatches(paVariable, rule_fast_check)) { - Logger.printDetail("DEBUGGING: replacing patterns..." + mr.group()); - if (!(rpm.containsKey(mr.group(1)))) { - Logger.printError("Error creating rule:" + rule_name); - Logger.printError("The following pattern used in this rule does not exist, does it? 
%" + mr.group(1)); - System.exit(-1); - } - rule_fast_check = rule_fast_check.replaceAll("%" + mr.group(1), rpm.get(mr.group(1))); - } - rule_fast_check = rule_fast_check.replaceAll(" ", "[\\\\s]+"); - patternFast = null; - try { - patternFast = Pattern.compile(rule_fast_check); - } catch (java.util.regex.PatternSyntaxException e) { - Logger.printError("Compiling rules resulted in errors."); - Logger.printError("Problematic rule is " + rule_name); - Logger.printError("Cannot compile pattern: " + rule_fast_check); - e.printStackTrace(); - System.exit(-1); - } + rule_extraction = expandVariables(rule_name, rule_extraction, rpm); + rule_extraction = replaceSpaces(rule_extraction); + Pattern pattern = null; + try { + LOG.trace("Compiling pattern {}: {}", rule_name, rule_extraction); + pattern = Pattern.compile(rule_extraction); + } catch (java.util.regex.PatternSyntaxException e) { + LOG.error("Compiling rules resulted in errors.", e); + LOG.error("Problematic rule is {}", rule_name); + LOG.error("Cannot compile pattern: {}", rule_extraction); + throw new InvalidPatternException("Pattern compilation error in '" + rule_name + "'", e); + } + // Pattern pattern = Pattern.compile(rule_extraction); + Rule rule = new Rule(rule_name, pattern, rule_normalization); + + // /////////////////////////////////// + // CHECK FOR ADDITIONAL CONSTRAINS // + // /////////////////////////////////// + if (maReadRules.group(4) != null) { + maAdditional.reset(line); + while (maAdditional.find()) { + String rulename = maAdditional.group(1); + if (rulename.equals("OFFSET")) { + rule.offset = maAdditional.group(2); + } else if (rulename.equals("NORM_QUANT")) { + rule.quant = maAdditional.group(2); + } else if (rulename.equals("NORM_FREQ")) { + rule.freq = maAdditional.group(2); + } else if (rulename.equals("NORM_MOD")) { + rule.mod = maAdditional.group(2); + } else if (rulename.equals("POS_CONSTRAINT")) { + rule.posConstratint = maAdditional.group(2); + } else if (rulename.equals("EMPTY_VALUE")) { + rule.emptyValue = maAdditional.group(2); + } else if (rulename.equals("FAST_CHECK")) { + String rule_fast_check = maAdditional.group(2); + // create pattern for rule fast check part -- similar to extraction part + // thus using paVariable and rpm + rule_fast_check = expandVariables(rule_name, rule_fast_check, rpm); + rule_fast_check = replaceSpaces(rule_fast_check); + try { + rule.fastCheck = Pattern.compile(rule_fast_check); + } catch (java.util.regex.PatternSyntaxException e) { + LOG.error("Compiling rules resulted in errors.", e); + LOG.error("Problematic rule is {}", rule_name); + LOG.error("Cannot compile pattern: {}", rule_fast_check); + throw new InvalidPatternException("Pattern compilation error in '" + rule_name + "'", e); } + } else { + LOG.warn("Unknown additional constraint: {}", maAdditional.group()); } } - - // /////////////////////////////////////////// - // READ DATE RULES AND MAKE THEM AVAILABLE // - // /////////////////////////////////////////// - if (resource.equals("daterules")) { - // get extraction part - hmDatePattern.put(pattern, rule_name); - // get normalization part - hmDateNormalization.put(rule_name, - rule_normalization); - // get offset part - if (!(rule_offset.equals(""))) { - hmDateOffset.put(rule_name, rule_offset); - } - // get quant part - if (!(rule_quant.equals(""))) { - hmDateQuant.put(rule_name, rule_quant); - } - // get freq part - if (!(rule_freq.equals(""))) { - hmDateFreq.put(rule_name, rule_freq); - } - // get mod part - if (!(rule_mod.equals(""))) { - hmDateMod.put(rule_name, 
rule_mod); - } - // get pos constraint part - if (!(pos_constraint.equals(""))) { - hmDatePosConstraint.put(rule_name, - pos_constraint); - } - // get empty value part - if (!(rule_empty_value.equals(""))) { - hmDateEmptyValue.put(rule_name, - rule_empty_value); - } - // get fast check part - if (!(rule_fast_check.equals(""))) { - hmDateFastCheck.put(rule_name, - patternFast); - } - } - - // /////////////////////////////////////////////// - // READ DURATION RULES AND MAKE THEM AVAILABLE // - // /////////////////////////////////////////////// - else if (resource.equals("durationrules")) { - // get extraction part - hmDurationPattern.put(pattern, rule_name); - // get normalization part - hmDurationNormalization.put(rule_name, - rule_normalization); - // get offset part - if (!(rule_offset.equals(""))) { - hmDurationOffset.put(rule_name, rule_offset); - } - // get quant part - if (!(rule_quant.equals(""))) { - hmDurationQuant.put(rule_name, rule_quant); - } - // get freq part - if (!(rule_freq.equals(""))) { - hmDurationFreq.put(rule_name, rule_freq); - } - // get mod part - if (!(rule_mod.equals(""))) { - hmDurationMod.put(rule_name, rule_mod); - } - // get pos constraint part - if (!(pos_constraint.equals(""))) { - hmDurationPosConstraint.put(rule_name, - pos_constraint); - } - // get empty value part - if (!(rule_empty_value.equals(""))) { - hmDurationEmptyValue.put(rule_name, - rule_empty_value); - } - // get fast check part - if (!(rule_fast_check.equals(""))) { - hmDurationFastCheck.put(rule_name, - patternFast); - } - } - - // ////////////////////////////////////////// - // READ SET RULES AND MAKE THEM AVAILABLE // - // ////////////////////////////////////////// - else if (resource.equals("setrules")) { - // get extraction part - hmSetPattern.put(pattern, rule_name); - // get normalization part - hmSetNormalization.put(rule_name, - rule_normalization); - // get offset part - if (!rule_offset.equals("")) { - hmSetOffset.put(rule_name, rule_offset); - } - // get quant part - if (!rule_quant.equals("")) { - hmSetQuant.put(rule_name, rule_quant); - } - // get freq part - if (!rule_freq.equals("")) { - hmSetFreq.put(rule_name, rule_freq); - } - // get mod part - if (!rule_mod.equals("")) { - hmSetMod.put(rule_name, rule_mod); - } - // get pos constraint part - if (!pos_constraint.equals("")) { - hmSetPosConstraint.put(rule_name, - pos_constraint); - } - // get empty value part - if (!(rule_empty_value.equals(""))) { - hmSetEmptyValue.put(rule_name, - rule_empty_value); - } - // get fast check part - if (!(rule_fast_check.equals(""))) { - hmSetFastCheck.put(rule_name, - patternFast); - } - } - - // /////////////////////////////////////////// - // READ TIME RULES AND MAKE THEM AVAILABLE // - // /////////////////////////////////////////// - else if (resource.equals("timerules")) { - // get extraction part - hmTimePattern.put(pattern, rule_name); - // get normalization part - hmTimeNormalization.put(rule_name, - rule_normalization); - // get offset part - if (!rule_offset.equals("")) { - hmTimeOffset.put(rule_name, rule_offset); - } - // get quant part - if (!rule_quant.equals("")) { - hmTimeQuant.put(rule_name, rule_quant); - } - // get freq part - if (!rule_freq.equals("")) { - hmTimeFreq.put(rule_name, rule_freq); - } - // get mod part - if (!rule_mod.equals("")) { - hmTimeMod.put(rule_name, rule_mod); - } - // get pos constraint part - if (!pos_constraint.equals("")) { - hmTimePosConstraint.put(rule_name, - pos_constraint); - } - // get empty value part - if (!(rule_empty_value.equals(""))) 
{ - hmTimeEmptyValue.put(rule_name, - rule_empty_value); - } - // get fast check part - if (!(rule_fast_check.equals(""))) { - hmTimeFastCheck.put(rule_name, - patternFast); - } - } - // ////////////////////////////////////////////// - // READ TEMPONYM RULES AND MAKE THEM AVAILABLE // - // ////////////////////////////////////////////// - else if (resource.equals("temponymrules")) { - // get extraction part - hmTemponymPattern.put(pattern, rule_name); - // get normalization part - hmTemponymNormalization.put(rule_name, - rule_normalization); - // get offset part - if (!(rule_offset.equals(""))) { - hmTemponymOffset.put(rule_name, rule_offset); - } - // get quant part - if (!(rule_quant.equals(""))) { - hmTemponymQuant.put(rule_name, rule_quant); - } - // get freq part - if (!(rule_freq.equals(""))) { - hmTemponymFreq.put(rule_name, rule_freq); - } - // get mod part - if (!(rule_mod.equals(""))) { - hmTemponymMod.put(rule_name, rule_mod); - } - // get pos constraint part - if (!(pos_constraint.equals(""))) { - hmTemponymPosConstraint.put(rule_name, - pos_constraint); - } - // get empty value part - if (!(rule_empty_value.equals(""))) { - hmTemponymEmptyValue.put(rule_name, - rule_empty_value); - } - // get fast check part - if (!(rule_fast_check.equals(""))) { - hmTemponymFastCheck.put(rule_name, - patternFast); - } - } else { - Logger.printDetail(component, "Resource not recognized by HeidelTime: " + resource); - } } - // ///////////////////////////////////////// - // CHECK FOR PROBLEMS WHEN READING RULES // - // ///////////////////////////////////////// - if (!correctLine) { - Logger.printError(component, "Cannot read the following line of rule resource " + resource); - Logger.printError(component, "Line: " + line); - } - - } - } - } catch (IOException e) { - e.printStackTrace(); - } finally { - try { - if(br != null) { - br.close(); - } - if(isr != null) { - isr.close(); - } - if(is != null) { - is.close(); + rules.add(rule); } - } catch(Exception e) { - e.printStackTrace(); + } catch (IOException e) { + LOG.error(e.getMessage(), e); + throw new RuntimeException("Cannot load patterns: " + e.getMessage(), e); } } } - public final HashMap getHmDatePattern() { - return hmDatePattern; - } - - public final HashMap getHmDurationPattern() { - return hmDurationPattern; - } - - public final HashMap getHmTimePattern() { - return hmTimePattern; - } - - public final HashMap getHmSetPattern() { - return hmSetPattern; - } - - public final HashMap getHmDateNormalization() { - return hmDateNormalization; - } - - public final HashMap getHmTimeNormalization() { - return hmTimeNormalization; - } - - public final HashMap getHmDurationNormalization() { - return hmDurationNormalization; - } - - public final HashMap getHmSetNormalization() { - return hmSetNormalization; - } - - public final HashMap getHmDateOffset() { - return hmDateOffset; - } - - public final HashMap getHmTimeOffset() { - return hmTimeOffset; - } - - public final HashMap getHmDurationOffset() { - return hmDurationOffset; - } - - public final HashMap getHmSetOffset() { - return hmSetOffset; - } - - public final HashMap getHmDateQuant() { - return hmDateQuant; - } - - public final HashMap getHmTimeQuant() { - return hmTimeQuant; - } - - public final HashMap getHmDurationQuant() { - return hmDurationQuant; - } - - public final HashMap getHmSetQuant() { - return hmSetQuant; - } - - public final HashMap getHmDateFreq() { - return hmDateFreq; - } - - public final HashMap getHmTimeFreq() { - return hmTimeFreq; - } - - public final HashMap 
getHmDurationFreq() { - return hmDurationFreq; - } - - public final HashMap getHmSetFreq() { - return hmSetFreq; - } - - public final HashMap getHmDateMod() { - return hmDateMod; - } - - public final HashMap getHmTimeMod() { - return hmTimeMod; - } - - public final HashMap getHmDurationMod() { - return hmDurationMod; - } - - public final HashMap getHmSetMod() { - return hmSetMod; - } - - public final HashMap getHmDatePosConstraint() { - return hmDatePosConstraint; - } - - public final HashMap getHmTimePosConstraint() { - return hmTimePosConstraint; - } - - public final HashMap getHmDurationPosConstraint() { - return hmDurationPosConstraint; - } - - public final HashMap getHmSetPosConstraint() { - return hmSetPosConstraint; - } - - public final HashMap getHmDateEmptyValue() { - return hmDateEmptyValue; - } - - public final HashMap getHmTimeEmptyValue() { - return hmTimeEmptyValue; - } - - public final HashMap getHmDurationEmptyValue() { - return hmDurationEmptyValue; - } - - public final HashMap getHmSetEmptyValue() { - return hmSetEmptyValue; - } - - public final HashMap getHmTemponymPattern() { - return hmTemponymPattern; - } - - public final HashMap getHmTemponymNormalization() { - return hmTemponymNormalization; - } - - public final HashMap getHmTemponymOffset() { - return hmTemponymOffset; - } - - public final HashMap getHmTemponymQuant() { - return hmTemponymQuant; - } - - public final HashMap getHmTemponymFreq() { - return hmTemponymFreq; - } - - public final HashMap getHmTemponymMod() { - return hmTemponymMod; - } - - public final HashMap getHmTemponymPosConstraint() { - return hmTemponymPosConstraint; - } - - public final HashMap getHmTemponymEmptyValue() { - return hmTemponymEmptyValue; + private static final Pattern paVariable = Pattern.compile("%(?:(re[a-zA-Z0-9]+)|\\((re[a-zA-Z0-9]+(?:\\|re[a-zA-Z0-9]+)*)\\))"); + private static final Pattern paSplit = Pattern.compile("%?\\|"); + + public static String expandVariables(CharSequence rule_name, String str, RePatternManager rpm) { + Matcher matcher = paVariable.matcher(str); + // Shortcut if no matches: + if (!matcher.find()) + return str; + StringBuilder buf = new StringBuilder(1000); + int pos = 0; + do { + List pats = new ArrayList<>(); + String g1 = matcher.group(1); + if (g1 != null) { + // Only one group matched. + String rep = rpm.get(g1); + if (rep == null) { + LOG.error("Error expanding rule '{}': RePattern not defined: '%{}'", rule_name, g1); + throw new InvalidPatternException("Rule '" + rule_name + "' referenced missing pattern '" + g1 + "'"); + } + pats.add(rep); + } else { + // Split, lookup, and join group(2). 
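+			// For illustration (names invented): "%(rePatternA|rePatternB)" arrives
+			// here with group(2) = "rePatternA|rePatternB"; every name must resolve
+			// through the RePatternManager, otherwise the rule is rejected below.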
+ String[] parts = paSplit.split(matcher.group(2)); + for (int i = 0; i < parts.length; i++) { + String rep = rpm.get(parts[i]); + if (rep == null) { + LOG.error("Error expanding rule '{}': RePattern not defined: '%{}'", rule_name, parts[i]); + throw new InvalidPatternException("Rule '" + rule_name + "' referenced missing pattern '" + parts[i] + "'"); + } + pats.add(rep); + } + } + if (pats.size() > 1) + pats = RePatternManager.optimizePatterns(rule_name, pats); + int start = matcher.start(), end = matcher.end(); + if (pos < start) + buf.append(str, pos, start); + Iterator it = pats.iterator(); + buf.append('(').append(it.next()); // first + while (it.hasNext()) + buf.append('|').append(it.next()); + buf.append(')'); + pos = end; + } while (matcher.find()); + if (pos < str.length()) + buf.append(str, pos, str.length()); + return buf.toString(); } - public final HashMap getHmDateFastCheck() { - return hmDateFastCheck; + public final List getHmDateRules() { + return hmDateRules; } - public final HashMap getHmTimeFastCheck() { - return hmTimeFastCheck; + public final List getHmDurationRules() { + return hmDurationRules; } - public final HashMap getHmDurationFastCheck() { - return hmDurationFastCheck; + public final List getHmTimeRules() { + return hmTimeRules; } - public final HashMap getHmSetFastCheck() { - return hmSetFastCheck; + public final List getHmSetRules() { + return hmSetRules; } - public final HashMap getHmTemponymFastCheck() { - return hmTemponymFastCheck; + public final List getHmTemponymRules() { + return hmTemponymRules; } } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ChineseNumbers.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ChineseNumbers.java new file mode 100644 index 00000000..90b934e7 --- /dev/null +++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ChineseNumbers.java @@ -0,0 +1,64 @@ +package de.unihd.dbs.uima.annotator.heideltime.utilities; + +import java.util.HashMap; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ChineseNumbers { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(ChineseNumbers.class); + + static HashMap CHINESE_NUMBERS_MAP = new HashMap<>(); + + static { + CHINESE_NUMBERS_MAP.put('零', '0'); + CHINESE_NUMBERS_MAP.put('一', '1'); + CHINESE_NUMBERS_MAP.put('二', '2'); + CHINESE_NUMBERS_MAP.put('三', '3'); + CHINESE_NUMBERS_MAP.put('四', '4'); + CHINESE_NUMBERS_MAP.put('五', '5'); + CHINESE_NUMBERS_MAP.put('六', '6'); + CHINESE_NUMBERS_MAP.put('七', '7'); + CHINESE_NUMBERS_MAP.put('八', '8'); + CHINESE_NUMBERS_MAP.put('九', '9'); + // Unicode arabic-lookalikes (wide) + CHINESE_NUMBERS_MAP.put('0', '0'); + CHINESE_NUMBERS_MAP.put('1', '1'); + CHINESE_NUMBERS_MAP.put('2', '2'); + CHINESE_NUMBERS_MAP.put('3', '3'); + CHINESE_NUMBERS_MAP.put('4', '4'); + CHINESE_NUMBERS_MAP.put('5', '5'); + CHINESE_NUMBERS_MAP.put('6', '6'); + CHINESE_NUMBERS_MAP.put('7', '7'); + CHINESE_NUMBERS_MAP.put('8', '8'); + CHINESE_NUMBERS_MAP.put('9', '9'); + // Allow real arabic, too. 
+ CHINESE_NUMBERS_MAP.put('0', '0'); + CHINESE_NUMBERS_MAP.put('1', '1'); + CHINESE_NUMBERS_MAP.put('2', '2'); + CHINESE_NUMBERS_MAP.put('3', '3'); + CHINESE_NUMBERS_MAP.put('4', '4'); + CHINESE_NUMBERS_MAP.put('5', '5'); + CHINESE_NUMBERS_MAP.put('6', '6'); + CHINESE_NUMBERS_MAP.put('7', '7'); + CHINESE_NUMBERS_MAP.put('8', '8'); + CHINESE_NUMBERS_MAP.put('9', '9'); + } + + public static String normalize(String chinese) { + String outString = ""; + for (int i = 0; i < chinese.length(); i++) { + char thisChar = chinese.charAt(i); + Character rep = CHINESE_NUMBERS_MAP.get((Character) thisChar); + if (rep != null) { + outString += rep; + } else { + // System.out.println(chineseNumerals.entrySet()); + LOG.error("Found an error in the resources: " + chinese + " contains " + "a character that is not defined in the Chinese numerals map. Normalization may be mangled."); + outString += thisChar; + } + } + return outString; + } +} \ No newline at end of file diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ContextAnalyzer.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ContextAnalyzer.java index 412a5adf..25d22f5b 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ContextAnalyzer.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ContextAnalyzer.java @@ -1,12 +1,17 @@ package de.unihd.dbs.uima.annotator.heideltime.utilities; +import java.util.ArrayList; +import java.util.Comparator; import java.util.List; -import java.util.TreeMap; -import java.util.regex.MatchResult; +import java.util.function.Function; +import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; import de.unihd.dbs.uima.annotator.heideltime.resources.NormalizationManager; @@ -14,608 +19,574 @@ import de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Timex3; import de.unihd.dbs.uima.types.heideltime.Token; + /** + * This class contains methods that work with the dependence of a subject with its surrounding data; namely via the jcas element or a subset list. * - * This class contains methods that work with the dependence of a subject with its - * surrounding data; namely via the jcas element or a subset list. 
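+ * <p>
+ * For instance, date reference resolution can ask for the last mentioned year
+ * (sketch; {@code linearDates} and {@code i} are assumed from the caller):
+ * <pre>{@code
+ * String lastYear = ContextAnalyzer.getLastMentionedYear(linearDates, i);
+ * }</pre>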
* @author jannik stroetgen - * */ public class ContextAnalyzer { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(ContextAnalyzer.class); + + public static enum Tense { + PRESENTFUTURE, PAST, FUTURE + } + + private static final Pattern BC_TWO_DIGITS = Pattern.compile("^(?:BC)?[0-9][0-9]"); + private static final Pattern BC_THREE_DIGITS = Pattern.compile("^(?:BC)?[0-9][0-9][0-9]"); + private static final Pattern BC_FOUR_DIGITS = Pattern.compile("^(?:BC)?[0-9][0-9][0-9][0-9]"); + private static final Pattern BC_YEAR_MON = Pattern.compile("^(?:BC)?[0-9][0-9][0-9][0-9]-[0-9][0-9]"); + private static final Pattern YEAR_MON = Pattern.compile("^[0-9][0-9][0-9][0-9]-[0-9][0-9]"); + private static final Pattern YEAR_MON_WK = Pattern.compile("^[0-9][0-9][0-9][0-9]-W[0-9][0-9]"); + private static final Pattern YEAR_MON_DAY = Pattern.compile("^([0-9][0-9][0-9][0-9])-([0-9][0-9])-([0-9][0-9])"); + private static final Pattern YEAR_QUARTER = Pattern.compile("^[0-9][0-9][0-9][0-9]-Q[1234]"); + private static final Pattern YEAR_SEASON = Pattern.compile("^[0-9][0-9][0-9][0-9]-(SP|SU|FA|WI)"); + + private static final Pattern PREVUE_ENVISAGEE = Pattern.compile("^(?:prévue?s?|envisagée?s?)$"); + /** * The value of the x of the last mentioned Timex is calculated. - * @param linearDates list of previous linear dates - * @param i index for the previous date entry - * @param x type to search for + * + * Within the same sentence, prefer a longer timex (e.g. a day vs. a year). + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @param x + * type to search for * @return last mentioned entry */ - public static String getLastMentionedX(List linearDates, int i, String x, Language language) { - NormalizationManager nm = NormalizationManager.getInstance(language, true); - + public static String getLastMentionedX(List linearDates, int i, Function func) { // Timex for which to get the last mentioned x (i.e., Timex i) Timex3 t_i = linearDates.get(i); - - String xValue = ""; - int j = i - 1; - while (j >= 0) { + final int t_i_begin = t_i.getBegin(); + + String bestrep = null, bestin = null; + int bestsen = -1, besti = -1; + for (int j = i - 1; j >= 0; --j) { Timex3 timex = linearDates.get(j); + if (j < besti - 5 || bestrep != null && timex.getSentId() < bestsen - 1) + break; // Don't go further back. 
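+			// Illustration (values invented): among the previous timexes "2009" and
+			// "2009-03-05", the value "2009-03-05" wins, because the length check
+			// below only accepts increasingly precise values.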
// check that the two timexes to compare do not have the same offset: - if (!(t_i.getBegin() == timex.getBegin())) { - - String value = timex.getTimexValue(); - if (!(value.contains("funcDate"))){ - if (x.equals("century")) { - if (value.matches("^[0-9][0-9].*")) { - xValue = value.substring(0,2); - break; - } - else if (value.matches("^BC[0-9][0-9].*")){ - xValue = value.substring(0,4); - break; - } - else { - j--; - } - } - else if (x.equals("decade")) { - if (value.matches("^[0-9][0-9][0-9].*")) { - xValue = value.substring(0,3); - break; - } - else if (value.matches("^BC[0-9][0-9][0-9].*")){ - xValue = value.substring(0,5); - break; - } - else { - j--; - } - } - else if (x.equals("year")) { - if (value.matches("^[0-9][0-9][0-9][0-9].*")) { - xValue = value.substring(0,4); - break; - } - else if (value.matches("^BC[0-9][0-9][0-9][0-9].*")){ - xValue = value.substring(0,6); - break; - } - else { - j--; - } - } - else if (x.equals("dateYear")) { - if (value.matches("^[0-9][0-9][0-9][0-9].*")) { - xValue = value; - break; - } - else if (value.matches("^BC[0-9][0-9][0-9][0-9].*")){ - xValue = value; - break; - } - else { - j--; - } - } - else if (x.equals("month")) { - if (value.matches("^[0-9][0-9][0-9][0-9]-[0-9][0-9].*")) { - xValue = value.substring(0,7); - break; - } - else if (value.matches("^BC[0-9][0-9][0-9][0-9]-[0-9][0-9].*")){ - xValue = value.substring(0,9); - break; - } - else { - j--; - } - } - else if (x.equals("month-with-details")) { - if (value.matches("^[0-9][0-9][0-9][0-9]-[0-9][0-9].*")) { - xValue = value; - break; - } -// else if (value.matches("^BC[0-9][0-9][0-9][0-9]-[0-9][0-9].*")) { -// xValue = value; -// break; -// } - else { - j--; - } - } - else if (x.equals("day")) { - if (value.matches("^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].*")) { - xValue = value.substring(0,10); - break; - } -// else if (value.matches("^BC[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].*")) { -// xValue = value.substring(0,12); -// break; -// } - else { - j--; - } - } - else if (x.equals("week")) { - if (value.matches("^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].*")) { - for (MatchResult r : Toolbox.findMatches(Pattern.compile("^(([0-9][0-9][0-9][0-9])-[0-9][0-9]-[0-9][0-9]).*"), value)) { - xValue = r.group(2)+"-W"+DateCalculator.getWeekOfDate(r.group(1)); - break; - } - break; - } - else if (value.matches("^[0-9][0-9][0-9][0-9]-W[0-9][0-9].*")) { - for (MatchResult r : Toolbox.findMatches(Pattern.compile("^([0-9][0-9][0-9][0-9]-W[0-9][0-9]).*"), value)) { - xValue = r.group(1); - break; - } - break; - } - // TODO check what to do for BC times - else { - j--; - } - } - else if (x.equals("quarter")) { - if (value.matches("^[0-9][0-9][0-9][0-9]-[0-9][0-9].*")) { - String month = value.substring(5,7); - String quarter = nm.getFromNormMonthInQuarter(month); - if(quarter == null) { - quarter = "1"; - } - xValue = value.substring(0,4)+"-Q"+quarter; - break; - } - else if (value.matches("^[0-9][0-9][0-9][0-9]-Q[1234].*")) { - xValue = value.substring(0,7); - break; - } - // TODO check what to do for BC times - else { - j--; - } - } - else if (x.equals("dateQuarter")) { - if (value.matches("^[0-9][0-9][0-9][0-9]-Q[1234].*")) { - xValue = value.substring(0,7); - break; - } - // TODO check what to do for BC times - else { - j--; - } - } - else if (x.equals("season")) { - if (value.matches("^[0-9][0-9][0-9][0-9]-[0-9][0-9].*")) { - String month = value.substring(5,7); - String season = nm.getFromNormMonthInSeason(month); - xValue = value.substring(0,4)+"-"+season; - break; - } -// else if 
(value.matches("^BC[0-9][0-9][0-9][0-9]-[0-9][0-9].*")) { -// String month = value.substring(7,9); -// String season = nm.getFromNormMonthInSeason(month); -// xValue = value.substring(0,6)+"-"+season; -// break; -// } - else if (value.matches("^[0-9][0-9][0-9][0-9]-(SP|SU|FA|WI).*")) { - xValue = value.substring(0,7); - break; - } -// else if (value.matches("^BC[0-9][0-9][0-9][0-9]-(SP|SU|FA|WI).*")) { -// xValue = value.substring(0,9); -// break; -// } - else { - j--; - } - } - } else { - j--; - } - } else { - j--; + if (t_i_begin == timex.getBegin()) + continue; + String value = timex.getTimexValue(); + if (bestrep != null && value.length() <= bestin.length()) + continue; // Only try to find more precise dates. + if (value.contains("funcDate")) + continue; + String rep = func.apply(value); + if (rep != null) { + bestrep = rep; + bestin = value; + bestsen = timex.getSentId(); + // We don't care beyond month resolution, or we don't have sentences. + if (value.length() >= 6 || bestsen == 0) + break; + } + } + // If we did not find in the same sentence, try also looking forward a little bit + final int curSen = t_i.getSentId(); + if (besti != curSen && curSen > 0) { + final int t_i_end = t_i.getEnd(); + int end = Math.min(i + 2, linearDates.size()); + for (int j = i + 1; j < end; j++) { + Timex3 timex = linearDates.get(j); + if (bestrep != null && timex.getSentId() > curSen) + break; // Don't go further forward. + // check that the two timexes to compare do not have the same offset: + if (t_i_end > timex.getBegin()) + continue; + String value = timex.getTimexValue(); + if (bestrep != null && value.length() <= bestin.length()) + continue; // Only try to find more precise dates. + if (value.contains("funcDate")) + continue; + String rep = func.apply(value); + if (rep != null) { + bestrep = rep; + bestin = value; + bestsen = timex.getSentId(); + // We don't care beyond month resolution, or we don't have sentences. + if (value.length() >= 6 || bestsen == 0) + break; } + } } - return xValue; + return bestrep != null ? bestrep : ""; + } + + /** + * The value of the x of the last mentioned Timex is calculated. + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @return last mentioned century + */ + public static String getLastMentionedCentury(List linearDates, int i) { + Matcher m = BC_TWO_DIGITS.matcher(""); + return getLastMentionedX(linearDates, i, value -> m.reset(value).find() ? m.group(0) : null); + } + + /** + * The value of the x of the last mentioned Timex is calculated. + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @return last mentioned decade + */ + public static String getLastMentionedDecade(List linearDates, int i) { + Matcher m = BC_THREE_DIGITS.matcher(""); + return getLastMentionedX(linearDates, i, value -> m.reset(value).find() ? m.group(0) : null); + } + + /** + * The value of the x of the last mentioned Timex is calculated. + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @return last mentioned year + */ + public static String getLastMentionedYear(List linearDates, int i) { + Matcher m = BC_FOUR_DIGITS.matcher(""); + return getLastMentionedX(linearDates, i, value -> m.reset(value).find() ? m.group(0) : null); + } + + /** + * The value of the x of the last mentioned Timex is calculated. 
+ * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @return last mentioned date year + */ + public static String getLastMentionedDateYear(List linearDates, int i) { + final Matcher m = BC_FOUR_DIGITS.matcher(""); + // TODO: return group instead of value? + return getLastMentionedX(linearDates, i, value -> m.reset(value).find() ? value : null); + } + + /** + * The value of the x of the last mentioned Timex is calculated. + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @return last mentioned month + */ + public static String getLastMentionedMonth(List linearDates, int i) { + Matcher m = BC_YEAR_MON.matcher(""); + return getLastMentionedX(linearDates, i, value -> m.reset(value).find() ? m.group(0) : null); + } + + /** + * The value of the x of the last mentioned Timex is calculated. + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @return last mentioned month with details + */ + public static String getLastMentionedMonthDetails(List linearDates, int i) { + Matcher m = YEAR_MON.matcher(""); + return getLastMentionedX(linearDates, i, value -> m.reset(value).find() ? m.group(0) : null); + } + + /** + * The value of the x of the last mentioned Timex is calculated. + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @return last mentioned day + */ + public static String getLastMentionedDay(List linearDates, int i) { + final Matcher m = YEAR_MON_DAY.matcher(""); + return getLastMentionedX(linearDates, i, value -> m.reset(value).find() ? value.substring(0, 10) : null); } - + + /** + * The value of the x of the last mentioned Timex is calculated. + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @return last mentioned week + */ + public static String getLastMentionedWeek(List linearDates, int i) { + Matcher m1 = YEAR_MON_DAY.matcher(""); + Matcher m2 = YEAR_MON_WK.matcher(""); + return getLastMentionedX(linearDates, i, value -> // + m1.reset(value).find() ? (m1.group(1) + "-W" + DateCalculator.getWeekOfDate(m1.group(0))) : // + m2.reset(value).find() ? value /* group? */ : null // + ); + } + + /** + * The value of the x of the last mentioned Timex is calculated. + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @param language + * Language + * @return last mentioned quarter + */ + public static String getLastMentionedQuarter(List linearDates, int i, Language language) { + final Matcher m1 = YEAR_MON.matcher(""); + final Matcher m2 = YEAR_QUARTER.matcher(""); + return getLastMentionedX(linearDates, i, value -> { + if (m1.reset(value).find()) { + NormalizationManager nm = NormalizationManager.getInstance(language, true); + String month = value.substring(5, 7); + String quarter = nm.getFromNormMonthInQuarter(month); + if (quarter == null) + quarter = "1"; + return value.substring(0, 4) + "-Q" + quarter; + } + return m2.reset(value).find() ? value.substring(0, 7) : null; + }); + } + + /** + * The value of the x of the last mentioned Timex is calculated. 
+ * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @return last mentioned date quarter + */ + public static String getLastMentionedDateQuarter(List linearDates, int i) { + final Matcher m = YEAR_QUARTER.matcher(""); + return getLastMentionedX(linearDates, i, value -> m.reset(value).find() ? value.substring(0, 7) : null); + } + + /** + * The value of the x of the last mentioned Timex is calculated. + * + * @param linearDates + * list of previous linear dates + * @param i + * index for the previous date entry + * @param language + * Language + * @return last mentioned season + */ + public static String getLastMentionedSeason(List linearDates, int i, Language language) { + final Matcher m1 = YEAR_MON.matcher(""); + final Matcher m2 = YEAR_SEASON.matcher(""); + return getLastMentionedX(linearDates, i, value -> { + if (m1.reset(value).find()) { + NormalizationManager nm = NormalizationManager.getInstance(language, true); + String month = value.substring(5, 7); + String season = nm.getFromNormMonthInSeason(month); + return value.substring(0, 4) + "-" + season; + } + // if (value.matches("^BC[0-9][0-9][0-9][0-9]-[0-9][0-9].*")) { + // String month = value.substring(7,9); + // String season = nm.getFromNormMonthInSeason(month); + // return value.substring(0,6)+"-"+season; + // } + if (m2.reset(value).find()) + return value.substring(0, 7); + // else if (value.matches("^BC[0-9][0-9][0-9][0-9]-(SP|SU|FA|WI).*")) { + // return value.substring(0,9); + // } + return null; + }); + } + /** * Get the last tense used in the sentence * - * @param timex timex construct to discover tense data for + * @param timex + * timex construct to discover tense data for * @return string that contains the tense */ - public static String getClosestTense(Timex3 timex, JCas jcas, Language language) { + public static Tense getClosestTense(Timex3 timex, JCas jcas, Language language) { RePatternManager rpm = RePatternManager.getInstance(language, false); - - String lastTense = ""; - String nextTense = ""; - + Matcher tensePos4PresentFuture = rpm.getCompiled("tensePos4PresentFuture").matcher(""); + Matcher tensePos4Past = rpm.getCompiled("tensePos4Past").matcher(""); + Matcher tensePos4Future = rpm.getCompiled("tensePos4Future").matcher(""); + Matcher tenseWord4Future = rpm.getCompiled("tenseWord4Future").matcher(""); + + Tense lastTense = null, nextTense = null; + int tokenCounter = 0; - int lastid = 0; - int nextid = 0; - int tid = 0; + int lastid = 0, nextid = 0; + int tid = 0; - // Get the sentence - FSIterator iterSentence = jcas.getAnnotationIndex(Sentence.type).iterator(); - Sentence s = new Sentence(jcas); - while (iterSentence.hasNext()) { - s = (Sentence) iterSentence.next(); - if ((s.getBegin() <= timex.getBegin()) - && (s.getEnd() >= timex.getEnd())) { - break; - } - } + ArrayList tmToken = getCloseTokens(timex, jcas); - // Get the tokens - TreeMap tmToken = new TreeMap(); - FSIterator iterToken = jcas.getAnnotationIndex(Token.type).subiterator(s); - while (iterToken.hasNext()) { - Token token = (Token) iterToken.next(); - tmToken.put(token.getEnd(), token); - } - // Get the last VERB token - for (Integer tokEnd : tmToken.keySet()) { + for (Token token : tmToken) { tokenCounter++; - if (tokEnd < timex.getBegin()) { - Token token = tmToken.get(tokEnd); - - Logger.printDetail("GET LAST TENSE: string:"+token.getCoveredText()+" pos:"+token.getPos()); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4PresentFuture):"+rpm.get("tensePos4PresentFuture")); -
Logger.printDetail("hmAllRePattern.containsKey(tensePos4Future):"+rpm.get("tensePos4Future")); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4Past):"+rpm.get("tensePos4Past")); - Logger.printDetail("CHECK TOKEN:"+token.getPos()); - - if (token.getPos() == null) { - - } - else if ((rpm.containsKey("tensePos4PresentFuture")) && (token.getPos().matches(rpm.get("tensePos4PresentFuture")))) { - lastTense = "PRESENTFUTURE"; - lastid = tokenCounter; + if (token.getEnd() < timex.getBegin()) { + String pos = token.getPos(); + if (pos == null) + continue; // POS not available? + + if (LOG.isTraceEnabled()) { + LOG.trace("GET LAST TENSE: string:" + token.getCoveredText() + " pos:" + pos); + LOG.trace("tensePos4PresentFuture pattern:" + tensePos4PresentFuture.pattern().pattern()); + LOG.trace("tensePos4Future pattern:" + tensePos4Future.pattern().pattern()); + LOG.trace("tensePos4Past pattern:" + tensePos4Past.pattern().pattern()); + LOG.trace("CHECK TOKEN: " + pos); } - else if ((rpm.containsKey("tensePos4Past")) && (token.getPos().matches(rpm.get("tensePos4Past")))) { - lastTense = "PAST"; + + if (tensePos4PresentFuture != null && tensePos4PresentFuture.reset(pos).matches()) { + lastTense = Tense.PRESENTFUTURE; lastid = tokenCounter; - } - else if ((rpm.containsKey("tensePos4Future")) && (token.getPos().matches(rpm.get("tensePos4Future")))) { - if (token.getCoveredText().matches(rpm.get("tenseWord4Future"))) { - lastTense = "FUTURE"; + } else if (tensePos4Past != null && tensePos4Past.reset(pos).matches()) { + lastTense = Tense.PAST; + lastid = tokenCounter; + } else if (tensePos4Future != null && tensePos4Future.reset(pos).matches()) { + if (tenseWord4Future.reset(token.getCoveredText()).matches()) { + lastTense = Tense.FUTURE; lastid = tokenCounter; } } - } - else { - if (tid == 0) { + } else { + if (tid == 0) tid = tokenCounter; - } } } tokenCounter = 0; - for (Integer tokEnd : tmToken.keySet()) { + for (Token token : tmToken) { tokenCounter++; - if (nextTense.equals("")) { - if (tokEnd > timex.getEnd()) { - Token token = tmToken.get(tokEnd); - - Logger.printDetail("GET NEXT TENSE: string:"+token.getCoveredText()+" pos:"+token.getPos()); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4PresentFuture):"+rpm.get("tensePos4PresentFuture")); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4Future):"+rpm.get("tensePos4Future")); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4Past):"+rpm.get("tensePos4Past")); - Logger.printDetail("CHECK TOKEN:"+token.getPos()); - - if (token.getPos() == null) { - + if (nextTense == null) { + if (token.getEnd() > timex.getEnd()) { + String pos = token.getPos(); + if (pos == null) + continue; // No POS available? 
+ + if (LOG.isTraceEnabled()) { + LOG.trace("GET NEXT TENSE: string:" + token.getCoveredText() + " pos:" + pos); + LOG.trace("tensePos4PresentFuture pattern:" + tensePos4PresentFuture.pattern().pattern()); + LOG.trace("tensePos4Future pattern:" + tensePos4Future.pattern().pattern()); + LOG.trace("tensePos4Past pattern:" + tensePos4Past.pattern().pattern()); + LOG.trace("CHECK TOKEN: " + pos); } - else if ((rpm.containsKey("tensePos4PresentFuture")) && (token.getPos().matches(rpm.get("tensePos4PresentFuture")))) { - nextTense = "PRESENTFUTURE"; + + if (tensePos4PresentFuture != null && tensePos4PresentFuture.reset(pos).matches()) { + nextTense = Tense.PRESENTFUTURE; nextid = tokenCounter; - } - else if ((rpm.containsKey("tensePos4Past")) && (token.getPos().matches(rpm.get("tensePos4Past")))) { - nextTense = "PAST"; + } else if (tensePos4Past != null && tensePos4Past.reset(pos).matches()) { + nextTense = Tense.PAST; nextid = tokenCounter; - } - else if ((rpm.containsKey("tensePos4Future")) && (token.getPos().matches(rpm.get("tensePos4Future")))) { - if (token.getCoveredText().matches(rpm.get("tenseWord4Future"))) { - nextTense = "FUTURE"; + } else if (tensePos4Future != null && tensePos4Future.reset(pos).matches()) { + if (tenseWord4Future.reset(token.getCoveredText()).matches()) { + nextTense = Tense.FUTURE; nextid = tokenCounter; } } } } } - if (lastTense.equals("")) { - Logger.printDetail("TENSE: "+nextTense); + if (lastTense == null) { + LOG.trace("TENSE: {}", nextTense); return nextTense; - } - else if (nextTense.equals("")) { - Logger.printDetail("TENSE: "+lastTense); + } else if (nextTense == null) { + LOG.trace("TENSE: {}", lastTense); return lastTense; - } - else { - // If there is tense before and after the timex token, + } else { + // If there is tense before and after the timex token, // return the closer one: if ((tid - lastid) > (nextid - tid)) { - Logger.printDetail("TENSE: "+nextTense); + LOG.trace("TENSE: {}", nextTense); return nextTense; + } else { + LOG.trace("TENSE: {}", lastTense); + return lastTense; } - else { - Logger.printDetail("TENSE: "+lastTense); - return lastTense; - } } } - - + /** * Get the last tense used in the sentence * - * @param timex timex construct to discover tense data for + * @param timex + * timex construct to discover tense data for * @return string that contains the tense */ - public static String getLastTense(Timex3 timex, JCas jcas, Language language) { + public static Tense getLastTense(Timex3 timex, JCas jcas, Language language) { RePatternManager rpm = RePatternManager.getInstance(language, false); - - String lastTense = ""; + Matcher tensePos4Past = rpm.getCompiled("tensePos4Past").matcher(""); + Matcher tensePos4Future = rpm.getCompiled("tensePos4Future").matcher(""); + Matcher tensePos4PresentFuture = rpm.getCompiled("tensePos4PresentFuture").matcher(""); + Matcher tenseWord4Future = rpm.getCompiled("tenseWord4Future").matcher(""); - // Get the sentence - FSIterator iterSentence = jcas.getAnnotationIndex(Sentence.type).iterator(); - Sentence s = new Sentence(jcas); - while (iterSentence.hasNext()) { - s = (Sentence) iterSentence.next(); - if ((s.getBegin() <= timex.getBegin()) - && (s.getEnd() >= timex.getEnd())) { - break; - } - } + Tense lastTense = null; - // Get the tokens - TreeMap tmToken = new TreeMap(); - FSIterator iterToken = jcas.getAnnotationIndex(Token.type).subiterator(s); - while (iterToken.hasNext()) { - Token token = (Token) iterToken.next(); - tmToken.put(token.getEnd(), token); - } + // Get the sentence + ArrayList 
tmToken = getCloseTokens(timex, jcas); // Get the last VERB token - for (Integer tokEnd : tmToken.keySet()) { - if (tokEnd < timex.getBegin()) { - Token token = tmToken.get(tokEnd); - - Logger.printDetail("GET LAST TENSE: string:"+token.getCoveredText()+" pos:"+token.getPos()); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4PresentFuture):"+rpm.get("tensePos4PresentFuture")); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4Future):"+rpm.get("tensePos4Future")); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4Past):"+rpm.get("tensePos4Past")); - Logger.printDetail("CHECK TOKEN:"+token.getPos()); - - if (token.getPos() == null) { - - } - else if ((rpm.containsKey("tensePos4PresentFuture")) && (token.getPos().matches(rpm.get("tensePos4PresentFuture")))) { - lastTense = "PRESENTFUTURE"; - Logger.printDetail("this tense:"+lastTense); - } - else if ((rpm.containsKey("tensePos4Past")) && (token.getPos().matches(rpm.get("tensePos4Past")))) { - lastTense = "PAST"; - Logger.printDetail("this tense:"+lastTense); + for (Token token : tmToken) { + if (token.getEnd() < timex.getBegin()) { + String coveredText = token.getCoveredText(); + String pos = token.getPos(); + if (pos == null) + continue; // No POS available? + + if (LOG.isTraceEnabled()) { + LOG.trace("GET LAST TENSE: string:" + coveredText + " pos: " + pos); + LOG.trace("tensePos4PresentFuture pattern:" + tensePos4PresentFuture.pattern().pattern()); + LOG.trace("tensePos4Future pattern:" + tensePos4Future.pattern().pattern()); + LOG.trace("tensePos4Past pattern:" + tensePos4Past.pattern().pattern()); + LOG.trace("CHECK TOKEN: " + pos); } - else if ((rpm.containsKey("tensePos4Future")) && (token.getPos().matches(rpm.get("tensePos4Future")))) { - if (token.getCoveredText().matches(rpm.get("tenseWord4Future"))) { - lastTense = "FUTURE"; - Logger.printDetail("this tense:"+lastTense); + + if (tensePos4PresentFuture != null && tensePos4PresentFuture.reset(pos).matches()) { + lastTense = Tense.PRESENTFUTURE; + } else if (tensePos4Past != null && tensePos4Past.reset(pos).matches()) { + lastTense = Tense.PAST; + } else if (tensePos4Future != null && tensePos4Future.reset(pos).matches()) { + if (tenseWord4Future.reset(coveredText).matches()) { + lastTense = Tense.FUTURE; } } - if (token.getCoveredText().equals("since")) { - lastTense = "PAST"; - Logger.printDetail("this tense:"+lastTense); - } - if (token.getCoveredText().equals("depuis")) { // French - lastTense = "PAST"; - Logger.printDetail("this tense:"+lastTense); + if (coveredText.equals("since") || coveredText.equals("depuis")) { + lastTense = Tense.PAST; } } - if (lastTense.equals("")) { - if (tokEnd > timex.getEnd()) { - Token token = tmToken.get(tokEnd); - - Logger.printDetail("GET NEXT TENSE: string:"+token.getCoveredText()+" pos:"+token.getPos()); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4PresentFuture):"+rpm.get("tensePos4PresentFuture")); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4Future):"+rpm.get("tensePos4Future")); - Logger.printDetail("hmAllRePattern.containsKey(tensePos4Past):"+rpm.get("tensePos4Past")); - Logger.printDetail("CHECK TOKEN:"+token.getPos()); - - if (token.getPos() == null) { - - } - else if ((rpm.containsKey("tensePos4PresentFuture")) && (token.getPos().matches(rpm.get("tensePos4PresentFuture")))) { - lastTense = "PRESENTFUTURE"; - Logger.printDetail("this tense:"+lastTense); - } - else if ((rpm.containsKey("tensePos4Past")) && (token.getPos().matches(rpm.get("tensePos4Past")))) { - lastTense = "PAST"; - 
Logger.printDetail("this tense:"+lastTense); - } - else if ((rpm.containsKey("tensePos4Future")) && (token.getPos().matches(rpm.get("tensePos4Future")))) { - if (token.getCoveredText().matches(rpm.get("tenseWord4Future"))) { - lastTense = "FUTURE"; - Logger.printDetail("this tense:"+lastTense); - } + if (lastTense == null && token.getEnd() > timex.getEnd()) { + String pos = token.getPos(); + + if (LOG.isTraceEnabled()) { + LOG.trace("GET NEXT TENSE: string:" + token.getCoveredText() + " pos:" + pos); + LOG.trace("hmAllRePattern.containsKey(tensePos4PresentFuture):" + tensePos4PresentFuture.pattern().pattern()); + LOG.trace("hmAllRePattern.containsKey(tensePos4Future):" + tensePos4Future.pattern().pattern()); + LOG.trace("hmAllRePattern.containsKey(tensePos4Past):" + tensePos4Past.pattern().pattern()); + LOG.trace("CHECK TOKEN:" + pos); + } + + if (pos != null) { + if (tensePos4PresentFuture != null && tensePos4PresentFuture.reset(pos).matches()) { + lastTense = Tense.PRESENTFUTURE; + } else if (tensePos4Past != null && tensePos4Past.reset(pos).matches()) { + lastTense = Tense.PAST; + } else if (tensePos4Future != null && tensePos4Future.reset(pos).matches()) { + if (tenseWord4Future.reset(token.getCoveredText()).matches()) + lastTense = Tense.FUTURE; } } } + if (lastTense != null) + LOG.trace("this tense: {} {}", token.getCoveredText(), lastTense); } // check for double POS Constraints (not included in the rule language, yet) TODO // VHZ VNN and VHZ VNN and VHP VNN and VBP VVN String prevPos = ""; - String longTense = ""; - if (lastTense.equals("PRESENTFUTURE")) { - for (Integer tokEnd : tmToken.keySet()) { - if (tokEnd < timex.getBegin()) { - Token token = tmToken.get(tokEnd); - if (("VHZ".equals(prevPos)) || ("VBZ".equals(prevPos)) || ("VHP".equals(prevPos)) || ("VBP".equals(prevPos)) - || (prevPos.equals("VER:pres"))) { - if ("VVN".equals(token.getPos()) || "VER:pper".equals(token.getPos())) { - if ((!(token.getCoveredText().equals("expected"))) && (!(token.getCoveredText().equals("scheduled")))) { - lastTense = "PAST"; - longTense = "PAST"; - Logger.printDetail("this tense:"+lastTense); + Tense longTense = null; + if (lastTense == Tense.PRESENTFUTURE) { + for (Token token : tmToken) { + if (token.getEnd() < timex.getBegin()) { + String pos = token.getPos(); + if ("VHZ".equals(prevPos) || "VBZ".equals(prevPos) || "VHP".equals(prevPos) || "VBP".equals(prevPos) || prevPos.equals("VER:pres")) { + if ("VVN".equals(pos) || "VER:pper".equals(pos)) { + String covered = token.getCoveredText(); + if (!(covered.equals("expected")) && !(covered.equals("scheduled"))) { + lastTense = longTense = Tense.PAST; + LOG.debug("this tense: {}", lastTense); } } } - prevPos = token.getPos(); + prevPos = pos; } - if (longTense.equals("")) { - if (tokEnd > timex.getEnd()) { - Token token = tmToken.get(tokEnd); - if (("VHZ".equals(prevPos)) || ("VBZ".equals(prevPos)) || ("VHP".equals(prevPos)) || ("VBP".equals(prevPos)) - || ("VER:pres".equals(prevPos))) { - if ("VVN".equals(token.getPos()) || "VER:pper".equals(token.getPos())) { - if ((!(token.getCoveredText().equals("expected"))) && (!(token.getCoveredText().equals("scheduled")))) { - lastTense = "PAST"; - longTense = "PAST"; - Logger.printDetail("this tense:"+lastTense); - } + if (longTense == null && token.getEnd() > timex.getEnd()) { + if ("VHZ".equals(prevPos) || "VBZ".equals(prevPos) || "VHP".equals(prevPos) || "VBP".equals(prevPos) || "VER:pres".equals(prevPos)) { + if ("VVN".equals(token.getPos()) || "VER:pper".equals(token.getPos())) { + String covered 
= token.getCoveredText(); + if (!(covered.equals("expected")) && !(covered.equals("scheduled"))) { + lastTense = longTense = Tense.PAST; + LOG.debug("this tense: {}", lastTense); } } - prevPos = token.getPos(); } + prevPos = token.getPos(); } } } // French: VER:pres VER:pper - if (lastTense.equals("PAST")) { - for (Integer tokEnd : tmToken.keySet()) { - if (tokEnd < timex.getBegin()) { - Token token = tmToken.get(tokEnd); - if (("VER:pres".equals(prevPos)) && ("VER:pper".equals(token.getPos()))) { - if (((token.getCoveredText().matches("^prévue?s?$"))) || ((token.getCoveredText().equals("^envisagée?s?$")))) { - lastTense = "FUTURE"; - longTense = "FUTURE"; - Logger.printDetail("this tense:"+lastTense); - } + if (lastTense == Tense.PAST) { + for (Token token : tmToken) { + if (token.getEnd() < timex.getBegin()) { + String pos = token.getPos(); + if ("VER:pres".equals(prevPos) && "VER:pper".equals(pos)) { + if (PREVUE_ENVISAGEE.matcher(token.getCoveredText()).matches()) { + lastTense = longTense = Tense.FUTURE; + LOG.debug("this tense: {}", lastTense); + } } - prevPos = token.getPos(); + prevPos = pos; } - if (longTense.equals("")) { - if (tokEnd > timex.getEnd()) { - Token token = tmToken.get(tokEnd); - if (("VER:pres".equals(prevPos)) && ("VER:pper".equals(token.getPos()))) { - if (((token.getCoveredText().matches("^prévue?s?$"))) || ((token.getCoveredText().equals("^envisagée?s?$")))) { - lastTense = "FUTURE"; - longTense = "FUTURE"; - Logger.printDetail("this tense:"+lastTense); + if (longTense == null) { + if (token.getEnd() > timex.getEnd()) { + String pos = token.getPos(); + if ("VER:pres".equals(prevPos) && "VER:pper".equals(pos)) { + if (PREVUE_ENVISAGEE.matcher(token.getCoveredText()).matches()) { + lastTense = longTense = Tense.FUTURE; + LOG.debug("this tense: {}", lastTense); } } - prevPos = token.getPos(); + prevPos = pos; } } } } - Logger.printDetail("TENSE: "+lastTense); - + LOG.trace("TENSE: {}", lastTense); return lastTense; } - + /** - * Check token boundaries of expressions. - * @param r MatchResult - * @param s Respective sentence - * @return whether or not the MatchResult is a clean one + * Get the tokens close to the given timex (i.e. the same sentence). + * + * @param timex + * Timex + * @param jcas + * Cas + * @return Tokens, sorted by end. 
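+ * If the CAS contains no sentence annotations, all tokens of the document are used instead.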
*/ - public static Boolean checkInfrontBehind(MatchResult r, Sentence s) { - Boolean ok = true; - - // get rid of expressions such as "1999" in 53453.1999 - if (r.start() > 1) { - if ((s.getCoveredText().substring(r.start() - 2, r.start()).matches("\\d\\."))){ - ok = false; - } - } - - // get rid of expressions if there is a character or symbol ($+) directly in front of the expression - if (r.start() > 0) { - if (((s.getCoveredText().substring(r.start() - 1, r.start()).matches("[\\w\\$\\+]"))) && - (!(s.getCoveredText().substring(r.start() - 1, r.start()).matches("\\(")))){ - ok = false; - } - } - - if (r.end() < s.getCoveredText().length()) { - if ((s.getCoveredText().substring(r.end(), r.end() + 1).matches("[°\\w]")) && - (!(s.getCoveredText().substring(r.end(), r.end() + 1).matches("\\)")))){ - ok = false; - } - if (r.end() + 1 < s.getCoveredText().length()) { - if (s.getCoveredText().substring(r.end(), r.end() + 2).matches( - "[\\.,]\\d")) { - ok = false; - } - } + private static ArrayList getCloseTokens(Timex3 timex, JCas jcas) { + // Get the sentence + AnnotationIndex sentences = jcas.getAnnotationIndex(Sentence.type); + Sentence s = null; + for (FSIterator iterSentence = sentences.iterator(); iterSentence.hasNext();) { + s = iterSentence.next(); + if (s.getBegin() <= timex.getBegin() && s.getEnd() >= timex.getEnd()) + break; } - return ok; + + // Get the tokens + AnnotationIndex tokens = jcas.getAnnotationIndex(Token.type); + FSIterator iter = (s != null) ? tokens.subiterator(s) : tokens.iterator(); + ArrayList tmToken = new ArrayList(); + while (iter.hasNext()) + tmToken.add(iter.next()); + tmToken.sort(SORT_TOKENS); + return tmToken; } - + /** - * Check token boundaries using token information - * @param r MatchResult - * @param s respective Sentence - * @param jcas current CAS object - * @return whether or not the MatchResult is a clean one - */ - public static Boolean checkTokenBoundaries(MatchResult r, Sentence s, JCas jcas){ - Boolean beginOK = false; - Boolean endOK = false; - - // whole expression is marked as a sentence - if ((r.end() - r.start()) == (s.getEnd() -s.getBegin())){ - return true; - } - - // Only check Token boundaries if no white-spaces in front of and behind the match-result - if ((r.start() > 0) - && ((s.getCoveredText().subSequence(r.start()-1, r.start()).equals(" "))) - && ((r.end() < s.getCoveredText().length()) && ((s.getCoveredText().subSequence(r.end(), r.end()+1).equals(" "))))) { - return true; - } - - // other token boundaries than white-spaces - else { - FSIterator iterToken = jcas.getAnnotationIndex(Token.type).subiterator(s); - while (iterToken.hasNext()) { - Token t = (Token) iterToken.next(); - - // Check begin - if ((r.start() + s.getBegin()) == t.getBegin()){ - beginOK = true; - } - // Tokenizer does not split number from some symbols (".", "/", "-", "–"), - // e.g., "...12 August-24 Augsut..." - else if ((r.start() > 0) - && ((s.getCoveredText().subSequence(r.start()-1, r.start()).equals(".")) - || (s.getCoveredText().subSequence(r.start()-1, r.start()).equals("/")) - || (s.getCoveredText().subSequence(r.start()-1, r.start()).equals("–")) - || (s.getCoveredText().subSequence(r.start()-1, r.start()).equals("-")))) { - beginOK = true; - } - - // Check end - if ((r.end() + s.getBegin()) == t.getEnd()) { - endOK = true; - } - // Tokenizer does not split number from some symbols (".", "/", "-", "–"), - // e.g., "... in 1990. New Sentence ..." 
- else if ((r.end() < s.getCoveredText().length()) - && ((s.getCoveredText().subSequence(r.end(), r.end()+1).equals(".")) - || (s.getCoveredText().subSequence(r.end(), r.end()+1).equals("/")) - || (s.getCoveredText().subSequence(r.end(), r.end()+1).equals("–")) - || (s.getCoveredText().subSequence(r.end(), r.end()+1).equals("-")))) { - endOK = true; - } - - if (beginOK && endOK) - return true; - } + * Sort tokens by the token end. + */ + private static final Comparator SORT_TOKENS = new Comparator() { + public int compare(Token o1, Token o2) { + return Integer.compare(o1.getEnd(), o2.getEnd()); } - return false; - } + }; } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/DateCalculator.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/DateCalculator.java index 0c9c531d..0f857342 100644 --- a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/DateCalculator.java +++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/DateCalculator.java @@ -1,283 +1,294 @@ package de.unihd.dbs.uima.annotator.heideltime.utilities; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.Calendar; +import java.text.ParsePosition; +import java.time.LocalDate; +import java.time.Year; +import java.time.YearMonth; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.format.DateTimeParseException; +import java.time.temporal.ChronoField; +import java.time.temporal.WeekFields; import java.util.Locale; -import de.unihd.dbs.uima.annotator.heideltime.resources.Language; -import de.unihd.dbs.uima.annotator.heideltime.resources.NormalizationManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * * This class contains methods that rely on calendar functions to calculate data. 
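+ * BC dates are read and written with the era-prefixed "GGyyyy" pattern (e.g. "BC0001"), AD dates with the plain "uuuu" year pattern.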
+ * + * @author jannik stroetgen + * + */ public class DateCalculator { - - public static String getXNextYear(String date, Integer x){ - - // two formatters depending if BC or not - SimpleDateFormat formatter = new SimpleDateFormat("yyyy"); - SimpleDateFormat formatterBC = new SimpleDateFormat("GGyyyy"); - - String newDate = ""; - Calendar c = Calendar.getInstance(); - + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(DateCalculator.class); + + // two formatters depending if BC or not + // "u" allows 0, "y" does not (BC0001, then AD0001) + static final DateTimeFormatter YEARFORMATTER = DateTimeFormatter.ofPattern("uuuu", Locale.ROOT); + static final DateTimeFormatter YEARFORMATTERBC = DateTimeFormatter.ofPattern("GGyyyy", Locale.ROOT); + + static final DateTimeFormatter FORMATTER = DateTimeFormatter.ofPattern("uuuu-MM-dd", Locale.ROOT); + static final DateTimeFormatter FORMATTERBC = DateTimeFormatter.ofPattern("GGyyyy-MM-dd", Locale.ROOT); + + static final DateTimeFormatter MONTHFORMATTER = DateTimeFormatter.ofPattern("uuuu-MM", Locale.ROOT); + static final DateTimeFormatter MONTHFORMATTERBC = DateTimeFormatter.ofPattern("GGyyyy-MM", Locale.ROOT); + + static final DateTimeFormatter WEEKFORMATTER = new DateTimeFormatterBuilder().appendPattern("YYYY-['W']w").parseDefaulting(WeekFields.ISO.dayOfWeek(), 1).toFormatter(Locale.ROOT); + static final DateTimeFormatter WEEKFORMATTER_WIDE = new DateTimeFormatterBuilder().appendPattern("YYYY-['W']ww").parseDefaulting(WeekFields.ISO.dayOfWeek(), 1).toFormatter(Locale.ROOT); + + private static Year parseBC(String date) throws DateTimeParseException { + if (date.length() == 0) + throw new DateTimeParseException("Empty date string.", date, 0); + if ("BC0000".equals(date)) + return Year.of(-1); + return Year.from(((Character.isDigit(date.charAt(0)) ? YEARFORMATTER : YEARFORMATTERBC)// + .parse(date, new ParsePosition(0)))); + } + + public static String getXNextYear(String date, int x) { try { - // read the original date - if (date.matches("^\\d.*")){ - c.setTime(formatter.parse(date)); - } - else{ - c.setTime(formatterBC.parse(date)); - } - // make calucaltion - c.add(Calendar.YEAR, x); - c.getTime(); - // check if new date is BC or AD for choosing formatter or formatterBC - int newEra = c.get(Calendar.ERA); - if (newEra > 0){ - newDate = formatter.format(c.getTime()); - } - else{ - newDate = formatterBC.format(c.getTime()); - } + Year d = parseBC(date).plusYears(x); + return d.format((d.get(ChronoField.ERA) == 1) ?
YEARFORMATTER : YEARFORMATTERBC); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return ""; } - catch (ParseException e) { - e.printStackTrace(); - } - return newDate; } - - public static String getXNextDecade(String date, Integer x) { - date = date + "0"; // deal with years not with centuries - - // two formatters depending if BC or not - SimpleDateFormat formatter = new SimpleDateFormat("yyyy"); - SimpleDateFormat formatterBC = new SimpleDateFormat("GGyyyy"); - - String newDate = ""; - Calendar c = Calendar.getInstance(); - + + public static String getXNextDecade(String date, int x) { try { - // read the original date - if (date.matches("^\\d.*")){ - c.setTime(formatter.parse(date)); - } - else{ - c.setTime(formatterBC.parse(date)); - } - - // make calucaltion - c.add(Calendar.YEAR, x*10); - c.getTime(); - - // check if new date is BC or AD for choosing formatter or formatterBC - int newEra = c.get(Calendar.ERA); - if (newEra > 0){ - newDate = formatter.format(c.getTime()).substring(0, 3); - } - else{ - newDate = formatterBC.format(c.getTime()).substring(0, 5); - } - - } catch (ParseException e) { - e.printStackTrace(); + date = date + "0"; // deal with years, not with decades + Year d = parseBC(date).plusYears(10 * x); + return d.format((d.get(ChronoField.ERA) == 1) ? YEARFORMATTER : YEARFORMATTERBC); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return ""; } - return newDate; } - - - public static String getXNextCentury(String date, Integer x) { - date = date + "00"; // deal with years not with centuries - int oldEra = 0; // 0 if BC date, 1 if AD date - - // two formatters depending if BC or not - SimpleDateFormat formatter = new SimpleDateFormat("yyyy"); - SimpleDateFormat formatterBC = new SimpleDateFormat("GGyyyy"); - - String newDate = ""; - Calendar c = Calendar.getInstance(); - + + public static String getXNextCentury(String date, int x) { try { - // read the original date - if (date.matches("^\\d.*")){ - c.setTime(formatter.parse(date)); - oldEra = 1; - } - else{ - c.setTime(formatterBC.parse(date)); - } - - // make calucaltion - c.add(Calendar.YEAR, x*100); - c.getTime(); - + date = date + "00"; // deal with years not with centuries + Year d = parseBC(date); + int oldEra = d.get(ChronoField.ERA); + d = d.plusYears(x * 100); + // check if new date is BC or AD for choosing formatter or formatterBC - int newEra = c.get(Calendar.ERA); - if (newEra > 0){ - if (oldEra == 0){ + int newEra = d.get(ChronoField.ERA); + if (newEra == 1) { + if (oldEra == 0) { // -100 if from BC to AD - c.add(Calendar.YEAR, -100); - c.getTime(); + d = d.minusYears(100); } - newDate = formatter.format(c.getTime()).substring(0, 2); - } - else{ - if (oldEra > 0){ + return d.format(YEARFORMATTER).substring(0, 2); + } else { + if (oldEra == 1) { // +100 if from AD to BC - c.add(Calendar.YEAR, 100); - c.getTime(); + d = d.plusYears(100); } - newDate = formatterBC.format(c.getTime()).substring(0, 4); + return d.format(YEARFORMATTERBC).substring(0, 4); } - - } catch (ParseException e) { - e.printStackTrace(); + + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return ""; } - return newDate; } - + /** * get the x-next day of date.
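+ * For example, {@code getXNextDay("2011-06-10", 5)} returns {@code "2011-06-15"}.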
* - * @param date given date to get new date from - * @param x type of temporal event to search for + * @param date + * given date to get new date from + * @param x + * amount of days to go forward * @return */ - public static String getXNextDay(String date, Integer x) { - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); - String newDate = ""; - Calendar c = Calendar.getInstance(); + public static String getXNextDay(String date, int x) { try { - c.setTime(formatter.parse(date)); - c.add(Calendar.DAY_OF_MONTH, x); - c.getTime(); - newDate = formatter.format(c.getTime()); - } catch (ParseException e) { - e.printStackTrace(); + return LocalDate.from(FORMATTER.parse(date, new ParsePosition(0))).plusDays(x).format(FORMATTER); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return ""; + } + } + + /** + * get the x-next day of date. + * + * @param year + * Year + * @param month + * Month + * @param days + * Day of Month + * @param x + * amount of days to go forward + * @return new date + */ + public static String getXNextDay(int year, int month, int days, int x) { + try { + return LocalDate.of(year, month, days).plusDays(x).format(FORMATTER); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return ""; + } - return newDate; } /** * get the x-next month of date * - * @param date current date - * @param x amount of months to go forward + * @param date + * current date + * @param x + * amount of months to go forward * @return new month */ - public static String getXNextMonth(String date, Integer x) { + public static String getXNextMonth(String date, int x) { + try { + YearMonth d = YearMonth.from((Character.isDigit(date.charAt(0)) ? MONTHFORMATTER : MONTHFORMATTERBC).parse(date, new ParsePosition(0))).plusMonths(x); + return d.format((d.get(ChronoField.ERA) == 1) ?
MONTHFORMATTER : MONTHFORMATTERBC); - // two formatters depending if BC or not - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM"); - SimpleDateFormat formatterBC = new SimpleDateFormat("GGyyyy-MM"); - String newDate = ""; - Calendar c = Calendar.getInstance(); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return ""; + } + } + /** + * get the x-next week of date + * + * @param date + * current date + * @param x + * amount of weeks to go forward + * @return new week + */ + public static String getXNextWeek(String date, int x) { try { - // read the original date - if (date.matches("^\\d.*")){ - c.setTime(formatter.parse(date)); - } - else{ - c.setTime(formatterBC.parse(date)); - } - // make calucaltion - c.add(Calendar.MONTH, x); - c.getTime(); - - // check if new date is BC or AD for choosing formatter or formatterBC - int newEra = c.get(Calendar.ERA); - if (newEra > 0){ - newDate = formatter.format(c.getTime()); - } - else{ - newDate = formatterBC.format(c.getTime()); - } - - } - catch (ParseException e) { - e.printStackTrace(); + LocalDate d = LocalDate.from(WEEKFORMATTER.parse(date, new ParsePosition(0))).plusWeeks(x); + return d.format(WEEKFORMATTER_WIDE); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return ""; } - return newDate; } - + /** * get the x-next week of date - * @param date current date - * @param x amount of weeks to go forward + * + * @param year + * Year + * @param month + * Month + * @param day + * Day of Month + * @param x + * amount of weeks to go forward * @return new week */ - public static String getXNextWeek(String date, Integer x, Language language) { - NormalizationManager nm = NormalizationManager.getInstance(language, false); - String date_no_W = date.replace("W", ""); - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-w"); - String newDate = ""; - Calendar c = Calendar.getInstance(); + public static String getXNextWeek(int year, int month, int day, int x) { try { - c.setTime(formatter.parse(date_no_W)); - c.add(Calendar.WEEK_OF_YEAR, x); - c.getTime(); - newDate = formatter.format(c.getTime()); - newDate = newDate.substring(0,4)+"-W"+nm.getFromNormNumber(newDate.substring(5)); - } catch (ParseException e) { - e.printStackTrace(); + LocalDate d = LocalDate.of(year, month, day).plusWeeks(x); + return d.format(WEEKFORMATTER_WIDE); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return ""; } - return newDate; } /** * Get the weekday of date * - * @param date current date + * Important: with the switch to Java 8, Sunday became 7 rather than 1! + * + * @param date + * current date * @return day of week */ public static int getWeekdayOfDate(String date) { - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); - int weekday = 0; - Calendar c = Calendar.getInstance(); try { - c.setTime(formatter.parse(date)); - weekday = c.get(Calendar.DAY_OF_WEEK); - } catch (ParseException e) { - e.printStackTrace(); + return LocalDate.from(FORMATTER.parse(date, new ParsePosition(0))).getDayOfWeek().getValue(); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return 0; } - return weekday; + } + + /** + * Get the weekday of date + * + * Important: with the switch to Java 8, Sunday became 7 rather than 1!
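+ * For example, {@code getWeekdayOfDate(2017, 1, 1)}, a Sunday, returns 7.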
+ * + * @param year + * Year + * @param month + * Month + * @param day + * Day of Month + * @return day of week + */ + public static int getWeekdayOfDate(int year, int month, int day) { + return LocalDate.of(year, month, day).getDayOfWeek().getValue(); } /** * Get the week of date * - * @param date current date + * @param date + * current date * @return week of year */ public static int getWeekOfDate(String date) { - SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); - int week = 0; - ; - Calendar c = Calendar.getInstance(); try { - c.setTime(formatter.parse(date)); - week = c.get(Calendar.WEEK_OF_YEAR); - } catch (ParseException e) { - e.printStackTrace(); + return LocalDate.from(FORMATTER.parse(date, new ParsePosition(0))).get(WeekFields.ISO.weekOfWeekBasedYear()); + } catch (DateTimeParseException e) { + LOG.error(e.getMessage(), e); + return 0; } - return week; } - + /** - * takes a desired locale input string, iterates through available locales, returns a locale object - * @param locale String to grab a locale for, i.e. en_US, en_GB, de_DE - * @return Locale to represent the input String + * Get the week of date + * + * @param year + * Year + * @param month + * Month + * @param day + * Day of Month + * @return week of year */ - public static Locale getLocaleFromString(String locale) throws LocaleException { - for(Locale l : Locale.getAvailableLocales()) { - if(locale.toLowerCase().equals(l.toString().toLowerCase())) { - return l; - } - } - throw new LocaleException(); + public static int getWeekOfDate(int year, int month, int day) { + return LocalDate.of(year, month, day).get(WeekFields.ISO.weekOfWeekBasedYear()); + } + + /** + * Get the quarter of the year, as string. + * + * @param dctMonth + * Month + * @return Quarter + */ + public static String getQuarterOfMonth(int dctMonth) { + return dctMonth <= 3 ? "Q1" : dctMonth <= 6 ? "Q2" : dctMonth <= 9 ? "Q3" : "Q4"; + } + + /** + * Get the half year, as string + * + * @param dctMonth + * Month + * @return Half year + */ + public static String getHalfYearOfMonth(int dctMonth) { + return (dctMonth <= 6) ? "H1" : "H2"; + } + + /** + * Get the season of a month, as string + * + * @param dctMonth + * Month + * @return Season + */ + public static Season getSeasonOfMonth(int dctMonth) { + return dctMonth <= 2 ? Season.WINTER : dctMonth <= 5 ? Season.SPRING : dctMonth <= 8 ? Season.SUMMER : dctMonth <= 11 ? Season.FALL : Season.WINTER; } } diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/DurationSimplification.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/DurationSimplification.java new file mode 100644 index 00000000..66d4cae5 --- /dev/null +++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/DurationSimplification.java @@ -0,0 +1,40 @@ +package de.unihd.dbs.uima.annotator.heideltime.utilities; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class DurationSimplification { + private static final Pattern SIMPLIFY_DURATION = Pattern.compile("(PT?)(\\d+)([HM])"); + + /** + * Durations of a finer granularity are mapped to a coarser one if possible, e.g., "PT24H" -> "P1D". One may add several further corrections. 
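+ * Further examples: "PT120M" becomes "PT2H", "PT1440M" becomes "P1D", and "P24M" becomes "P2Y".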
+ * + * @param value + * Duration value to simplify + * @return Simplified value, or the input if no simplification applies + */ + public static String simplify(String value) { + Matcher m = SIMPLIFY_DURATION.matcher(value); + if (m.matches()) { + int ival = Integer.parseInt(m.group(2)); + String g1 = m.group(1), g3 = m.group(3); + if (g1.equals("PT")) { + // x*24 hours to x days + if (g3.equals("H") && (ival % 24 == 0)) + return "P" + (ival / 24) + "D"; + // x*60 minutes to x hours + if (g3.equals("M") && (ival % 60 == 0)) { + // x*60*24 minutes to x days + if (ival % 1440 == 0) + return "P" + (ival / 1440) + "D"; + return "PT" + (ival / 60) + "H"; + } + } else if (g1.equals("P")) { + // x*12 months to years + if (g3.equals("M") && (ival % 12 == 0)) + return "P" + (ival / 12) + "Y"; + + } + } + return value; + } +} \ No newline at end of file diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/IntArrayList.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/IntArrayList.java new file mode 100644 index 00000000..5fec53a1 --- /dev/null +++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/IntArrayList.java @@ -0,0 +1,120 @@ +package de.unihd.dbs.uima.annotator.heideltime.utilities; + +import java.util.Arrays; + +/** + * Efficient replacement for {@code ArrayList}. + * + * We intentionally do not implement {@code Collection}! + * + * @author Erich Schubert + */ +public class IntArrayList { + /** Data storage */ + int[] data; + /** Used storage */ + int len; + + /** + * Constructor. + */ + public IntArrayList() { + this(11); + } + + /** + * Constructor. + * + * @param capacity + * Capacity + */ + public IntArrayList(int capacity) { + data = new int[capacity]; + } + + /** + * Size of list. + * + * @return size + */ + public int size() { + return len; + } + + /** + * Get an entry. + * + * @param p + * Position + * @return Value + */ + public int get(int p) { + assert (p < len); + return data[p]; + } + + /** + * Append a value. + * + * @param v + * Value to append. + */ + public void add(int v) { + if (data.length == len) { + int newlen = len < 5 ? 11 : (len + (len >> 1)) | 1; + data = Arrays.copyOf(data, newlen); + } + data[len++] = v; + } + + /** + * Sort the list. + */ + public void sort() { + Arrays.sort(data, 0, len); + } + + /** + * Sort and remove duplicates. + */ + public void sortRemoveDuplicates() { + if (len == 0) + return; + int prev = data[0]; + int p = 1; + for (int i = 1; i < len; i++) { + int next = data[i]; + if (next == prev) + continue; + prev = data[p++] = next; + } + len = p; + } + + /** + * Perform a binary search (array must be sorted). + * + * @param key + * Key to search + * @return Position. Negative values indicate insertion positions. + */ + public int binarySearch(int key) { + return Arrays.binarySearch(data, 0, len, key); + } + + /** + * Clear the list. + */ + public void clear() { + len = 0; + } + + /** + * Check if the list is empty. + * + * @return {@code true} if the list is empty + */ + public boolean isEmpty() { + return len == 0; + } +} \ No newline at end of file diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/Logger.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/Logger.java deleted file mode 100644 index 11b70174..00000000 --- a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/Logger.java +++ /dev/null @@ -1,81 +0,0 @@ -package de.unihd.dbs.uima.annotator.heideltime.utilities; -/** - * Logger class to facilitate a centralized logging effort.
Upon initialization of - * the HeidelTime annotator, the verbosity (printDetails) should be set; any kind of - * output should be done using either the printDetail()-methods for DEBUG-Level, - * conditional output or the printError()-methods for ERROR-Level, unconditional - * output. - * @author julian zell - * - */ -public class Logger { - private static Boolean printDetails = false; - - /** - * Controls whether DEBUG-Level information is printed or not - * @param printDetails to print or not to print - */ - public static void setPrintDetails(Boolean printDetails) { - Logger.printDetails = printDetails; - } - - /** - * print DEBUG level information with package name - * @param component Component from which the message originates - * @param msg DEBUG-level message - */ - public static void printDetail(Class c, String msg) { - if(Logger.printDetails) { - String preamble; - if(c != null) - preamble = "["+c.getSimpleName()+"]"; - else - preamble = ""; - - synchronized(System.err) { - System.err.println(preamble+" "+msg); - } - } - } - - /** - * no-package proxy method - * @param msg DEBUG-Level message - */ - public static void printDetail(String msg) { - printDetail(null, msg); - } - - /** - * print an ERROR-Level message with package name - * @param component Component from which the message originates - * @param msg ERROR-Level message - */ - public static void printError(Class c, String msg) { - String preamble; - if(c != null) - preamble = "["+c.getSimpleName()+"]"; - else - preamble = ""; - - synchronized(System.err) { - System.err.println(preamble+" "+msg); - } - } - - /** - * no-package proxy method - * @param msg ERROR-Level message - */ - public static void printError(String msg) { - printError(null, msg); - } - - - /** - * Outputs whether DEBUG-Level information is printed or not - */ - public static Boolean getPrintDetails() { - return printDetails; - } -} diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ParseInteger.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ParseInteger.java new file mode 100644 index 00000000..b6e2ce25 --- /dev/null +++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/ParseInteger.java @@ -0,0 +1,67 @@ +package de.unihd.dbs.uima.annotator.heideltime.utilities; + +public class ParseInteger { + private ParseInteger() { + // Utility class, use static methods. + } + + /** + * Parse an integer within a string. + * + * This is derived from {@link Integer#parseInt(String)}, but allows using subsequences of an arbitrary CharSequence. 
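+ * For example, {@code parseInt("2017-05-12", 0, 4)} returns {@code 2017} without allocating a substring.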
+ * + * @param s + * String + * @param b + * Start + * @param e + * End (exclusive) + * @return Integer + */ + public static int parseInt(CharSequence s, int b, int e) { + if (s == null) + throw new NumberFormatException("null"); + if (e <= b) + throw new NumberFormatException("Empty string"); + + int result = 0; + boolean negative = false; + int limit = -Integer.MAX_VALUE; + int multmin = limit / 10; + int digit; + + int i = b; + char firstChar = s.charAt(i); + if (firstChar < '0') { // Possible leading "+" or "-" + if (firstChar == '-') { + negative = true; + limit = Integer.MIN_VALUE; + } else if (firstChar != '+') + throw new NumberFormatException("For input string: \"" + s.subSequence(b, e) + "\""); + i++; + if (i == e) // Cannot have lone "+" or "-" + throw new NumberFormatException("lone + or -"); + } + while (i < e) { + // Accumulating negatively avoids surprises near MAX_VALUE + digit = Character.digit(s.charAt(i++), 10); + if (digit < 0) + throw new NumberFormatException("For input string: \"" + s.subSequence(b, e) + "\""); + if (result < multmin) + throw new NumberFormatException("For input string: \"" + s.subSequence(b, e) + "\""); + result *= 10; + if (result < limit + digit) + throw new NumberFormatException("For input string: \"" + s.subSequence(b, e) + "\""); + result -= digit; + } + return negative ? result : -result; + } + + public static int parseInt(CharSequence s) { + return parseInt(s, 0, s.length()); + } + + public static int parseIntAt(CharSequence s, int b) { + return parseInt(s, b, s.length()); + } +} diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/RegexpOptimizer.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/RegexpOptimizer.java new file mode 100644 index 00000000..4cb986dd --- /dev/null +++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/RegexpOptimizer.java @@ -0,0 +1,729 @@ +package de.unihd.dbs.uima.annotator.heideltime.utilities; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Try to optimize constructed regexps for performance. + * + * This is currently an ugly hack, and only supports a very limited subset of regular expressions. + * + * In particular, only non-capturing groups are supported. It even has a hard-coded list (see method {@code #isSimple}) of permitted characters. + * + * Don't use it on regexps that expand massively, such as phone numbers! + * + * This needs to eventually be rewritten into a more general tool, and with a proper regexp parser. + * + * @author Erich Schubert + */ +public class RegexpOptimizer { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(RegexpOptimizer.class); + + /** + * Exception thrown when unsupported constructs are used in the optimizer. + * + * @author Erich Schubert + */ + public static class OptimizerException extends Exception { + /** Serialization version */ + private static final long serialVersionUID = 1L; + + /** + * Constructor. + * + * @param m + * Error message.
*/ + public OptimizerException(String m) { + super(m); + } + } + + @FunctionalInterface + public static interface Consumer { + void accept(CharSequence str) throws OptimizerException; + } + + public static void expandPatterns(String s, Consumer out) throws OptimizerException { + expandPatterns(s, 0, s.length(), new StringBuilder(), out); + } + + public static void expandPatterns(String s, int i, int len, StringBuilder b, Consumer out) throws OptimizerException { + if (i >= len) { + out.accept(b); + return; + } + char cur = s.charAt(i); + if (isSimple(cur)) { + int l = b.length(); + b.append(cur); + expandPatterns(s, i + 1, len, b, out); + b.setLength(l); + return; + } + if (cur == '\\') { + // Escape character. + if (i + 1 == len) + throw new OptimizerException("Last character was an escape"); + int l = b.length(); + b.append(cur).append(s.charAt(i + 1)); + expandPatterns(s, i + 2, len, b, out); + b.setLength(l); + return; + } + if (cur == '?') { + // Previous character was optional. + int l = b.length(); + if (l == 0) + throw new OptimizerException("First character was a question mark"); + if (isSimple(b.charAt(l - 1)) && (l == 1 || isSimple(b.charAt(l - 2)))) { + // Expand with the optional character. + expandPatterns(s, i + 1, len, b, out); + b.setLength(l - 1); + // Expand without. + expandPatterns(s, i + 1, len, b, out); + b.setLength(l - 1); + } else { + b.append('?'); + expandPatterns(s, i + 1, len, b, out); + b.setLength(l - 1); + } + return; + } + if (cur == '[') { + int end = i + 1, nextp = -1; + boolean simple = true; + String optional = null; + for (; end < len; end++) { + char next = s.charAt(end); + if (next == ']') { + nextp = end + 1; + if (end + 1 < len && s.charAt(end + 1) == '?') { + optional = "?"; + nextp++; + // Possessive + if (end + 2 < len && s.charAt(end + 2) == '+') { + optional = "?+"; + nextp++; + } + } + break; + } + if (next == '-') + if (end != i + 1 && (end + 1 < len && s.charAt(end + 1) != ']')) + simple = false; + if (next == '[') + throw new OptimizerException("Nested ["); + if (next == '\\') + throw new OptimizerException("Escaped chars"); + } + if (end >= len || s.charAt(end) != ']') + throw new OptimizerException("Did not find matching []"); + final int l = b.length(); + if (simple) { + // Expand simple character ranges: + if (optional != null) { + // FIXME: retain possessive?
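+ // First expand the variant that omits the optional character class: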
+ expandPatterns(s, nextp, len, b, out); + b.setLength(l); + } + for (char c = c1; c <= c3; c++) { + if (!isSimple(c)) { + throw new OptimizerException("Non-simple char in char range: " + c); + } + b.append(c); + expandPatterns(s, nextp, len, b, out); + b.setLength(l); + } + } + return; + } + // We simply copy&paste more complex character ranges + for (int j = i; j < nextp; j++) + b.append(s.charAt(j)); + expandPatterns(s, nextp, len, b, out); + b.setLength(l); + } + return; + } + if (cur == '(') { + int end = i + 1, begin = i + 1, nextp = -1; + int depth = 1; + boolean simple = true; + String optional = null; + for (int r = 0; end < len; end++, r++) { + char next = s.charAt(end); + if (r == 0) { + if (next != '?') + throw new OptimizerException("Non-optional group"); + ++begin; + continue; + } + if (r == 1) { + if (next != ':') + throw new OptimizerException("Non-optional group"); + ++begin; + continue; + } + if (next == ')' && --depth == 0) { + nextp = end + 1; + // Trailing modifiers + if (end + 1 < len && s.charAt(end + 1) == '?') { + optional = "?"; + nextp++; + // Possessive + if (end + 2 < len && s.charAt(end + 2) == '+') { + optional = "?+"; + nextp++; + } + } + break; + } + if (next == '\\') { + simple = false; + if (end + 1 == len) + throw new OptimizerException("Escape at end of group?!?"); + ++end; + } + if (next == '[' || next == '?' || next == '*' || next == '\\') { + simple = false; + // throw new ExpansionException("Special char " + next + " in group"); + } + if (next == '(') { + ++depth; + simple = false; + } + } + if (end >= len || s.charAt(end) != ')') + throw new OptimizerException("Did not find matching '()'"); + if (simple) { + int l = b.length(); + if (optional != null) { + expandPatterns(s, nextp, len, b, out); + b.setLength(l); + } + for (int j = begin; j < end; j++) { + char c = s.charAt(j); + if (c == '|') { + expandPatterns(s, nextp, len, b, out); + b.setLength(l); + continue; + } + b.append(c); + } + expandPatterns(s, nextp, len, b, out); + b.setLength(l); + return; + } + // Non-simple expansion: + // LOG.trace("Need to expand: " + s.substring(begin - 3, begin) + ">>" + s.substring(begin, end) + "<<" + s.substring(end, nextp)); + assert (depth == 0); + depth = 0; + int l = b.length(); + if (optional != null) { + expandPatterns(s, nextp, len, b, out); + b.setLength(l); + } + final int cont = nextp; // Make effectively final. 
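+ // Split the group on top-level '|' and recursively expand each alternative: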
+ int prev = begin; + for (int j = begin; j < end; j++) { + char c = s.charAt(j); + if (c == '|' && depth == 0) { + // LOG.trace("Need to expand: " + s.substring(prev, j)); + expandPatterns(s, prev, j, new StringBuilder(), x -> { + // LOG.trace("Recursive expansion to: " + x); + b.append(x); + expandPatterns(s, cont, len, b, out); + b.setLength(l); + }); + prev = j + 1; + } else if (c == '(') + ++depth; + else if (c == ')') + --depth; + else if (c == '\\') + ++j; + } + if (depth != 0) + throw new OptimizerException("Could not close () group."); + expandPatterns(s, prev, end, new StringBuilder(), x -> { + // System.err.println("Recursive expansion to: " + x); + b.append(x); + expandPatterns(s, cont, len, b, out); + b.setLength(l); + }); + return; + } + throw new OptimizerException("Unhandled character " + cur + " at " + s.substring(Math.max(0, i - 5), Math.min(s.length(), i + 5))); + } + + private static boolean isSimple(char cur) { + return cur == ' ' || cur == '\'' || cur == '&' || cur == '-' || cur == ',' || Character.isAlphabetic(cur) || Character.isDigit(cur); + } + + private static final Comparator upperLowerChar = new Comparator() { + public int compare(String o1, String o2) { + int l1 = o1.length(), l2 = o2.length(); + int l = l1 < l2 ? l1 : l2; + for (int i = 0; i < l; i++) { + char c1 = o1.charAt(i), c2 = o2.charAt(i); + if (c1 != c2) { + char d1 = Character.toLowerCase(c1), d2 = Character.toLowerCase(c2); + return (d1 == d2) ? Character.compare(c1, c2) : Character.compare(d1, d2); + } + } + return l1 < l2 ? -1 : l1 == l2 ? 0 : +1; + } + }; + + public static String combinePatterns(Collection patterns) throws OptimizerException { + String[] ps = patterns.toArray(new String[patterns.size()]); + Arrays.sort(ps, upperLowerChar); + // Remove duplicates: + int l = 0; + String last = null; + for (int i = 0; i < ps.length; i++) { + String cand = ps[i]; + if (cand.equals(last)) { + continue; + } + ps[l++] = cand; + last = cand; + } + if (l < ps.length) + LOG.trace("Removed {} duplicate strings.", ps.length - l); + if (l == 0) + return ""; + ArrayList toplevel = new ArrayList<>(); + build(ps, 0, l, 0, x -> toplevel.add(x.toString())); + StringBuilder buf = new StringBuilder(); + buildGroup(toplevel.toArray(new String[toplevel.size()]), 0, toplevel.size(), 0, 0, x -> { + assert (buf.length() == 0); + buf.append(x); + }, new StringBuilder(), new StringBuilder()); + return buf.toString(); + } + + private static void build(String[] ps, int start, int end, int knownl, Consumer out) throws OptimizerException { + String k = ps[start]; + // assert (k.length() > knownl) : "Duplicates not removed?"; + // Only one string remaining: + if (start + 1 == end) { + if (knownl == k.length()) { + out.accept(""); + return; + } + char next = k.charAt(knownl); + if (next == '*' || next == '?' || next == '+') { + throw new OptimizerException("Bad split: " + k.substring(0, knownl) + "<<>>" + k.substring(knownl)); + } + out.accept(k.substring(knownl)); + return; + } + int l = knownl < k.length() ? 
nextLength(k, knownl) : knownl; + // System.err.println("Next length: " + l + " in " + k); + StringBuilder buf1 = new StringBuilder(), buf2 = new StringBuilder(); + int begin = start, pos = start; + while (pos < end) { + String cand = ps[pos]; + if (k.regionMatches(0, cand, 0, l)) { + ++pos; + continue; + } + buildGroup(ps, begin, pos, knownl, l, out, buf1, buf2); + k = cand; + begin = pos; + l = nextLength(k, knownl); + // System.err.println("Next length: " + l + " in " + k); + } + if (begin < pos) { + buildGroup(ps, begin, pos, knownl, l, out, buf1, buf2); + } + } + + private static void buildGroup(String[] ps, int begin, int end, int subbegin, int subend, Consumer out, StringBuilder buf, StringBuilder tmp) throws OptimizerException { + String key = ps[begin]; + // One element "group": + if (begin + 1 == end) { + buf.setLength(0); + buf.append(key, subbegin, key.length()); + out.accept(buf); + return; + } + // Two element "group": + if (begin + 2 == end) { + int p = prefixLength(ps, begin, end, subend); + String other = ps[begin + 1]; + assert (!other.equals(key)) : "Duplicates not removed?"; + buf.setLength(0); + buf.append(key, subbegin, p); + if (p == key.length()) { + if (p + 1 == other.length()) { + buf.append(other.charAt(p)).append('?'); + } else { + buf.append("(?:").append(other, p, other.length()).append(")?"); + } + } else { + buf.append("(?:").append(key, p, key.length()).append('|'); + buf.append(other, p, other.length()).append(')'); + } + LOG.trace("buildGroup two-element case: {}", buf); + out.accept(buf); + return; + } + // So we have at least three strings now. + // The basic pattern we build is: ((|)) + // This should probably be handled by clever recursion eventually... + // Challenges arise because of pattern optimizations, such as character groups. + // Skip a prefix if shared by all strings: + assert (subend <= key.length()) : key + " " + subbegin + "-" + subend; + // p is the first position where they differ. + int prefixend = prefixLength(ps, begin, end, subend), midfixend = prefixend; + // Prefix alone is a valid pattern: + final boolean prefixOnly; + if (prefixOnly = (key.length() == prefixend)) { + ++begin; + // Find midfix: + midfixend = prefixLength(ps, begin, end, prefixend); + key = ps[begin]; + } + // All remaining patterns will begin with midfix now. + // Expand all patterns, starting at the midfix position: + ArrayList cs = new ArrayList<>(); + build(ps, begin, end, midfixend, x -> cs.add(x.toString())); + // Find a common postfix: + String postfix = findPostfix(cs); + // Check if we have an entry "": + boolean innerGroupOptional = cs.remove(postfix); + if (cs.isEmpty()) { + assert (prefixOnly && innerGroupOptional); + // Simply ? 
+		buf.setLength(0);
+		buf.append(key, subbegin, prefixend); // Add prefix
+		buf.append("(?:");
+		buf.append(key, prefixend, midfixend); // Midfix
+		assert (postfix.charAt(postfix.length() - 1) == '?'); // Must be optional
+		buf.append(postfix);
+		buf.append(")?"); // prefixOnly
+		LOG.trace("buildGroup degenerate case: {}", buf);
+		out.accept(buf);
+		return;
+	}
+	// Special case: the remaining difference is a single character:
+	if (sameLength(cs, 1 + postfix.length())) {
+		// Build the inner group in tmp as: <midfix>[a-z]?<postfix>
+		tmp.setLength(0);
+		tmp.append(key, prefixend, midfixend); // Midfix
+		if (cs.size() == 1) {
+			// Single (optional) character:
+			tmp.append(cs.get(0).charAt(0));
+		} else {
+			// Build character range (if more than one character):
+			tmp.append('[');
+			for (int i = 0; i < cs.size(); i++) {
+				tmp.append(cs.get(i).charAt(0));
+			}
+			mergeCharRanges(tmp, 1); // Ignoring "["
+			tmp.append(']');
+		}
+		if (innerGroupOptional) {
+			tmp.append('?'); // May be optional
+		}
+		tmp.append(postfix); // We simply add the postfix, too.
+		// tmp now is: <midfix>[a-z]?<postfix>
+		// Basic pattern to build: <prefix>(?:<tmp>)?
+		buf.setLength(0);
+		buf.append(key, subbegin, prefixend); // Add prefix
+		// We only need the group for prefixOnly AND (midfix or postfix).
+		if (prefixOnly && (prefixend != midfixend || !postfix.isEmpty())) {
+			// Pattern: <prefix>(?:<tmp>)?
+			buf.append("(?:");
+			buf.append(tmp); // Char range
+			buf.append(")?"); // close prefixOnly=true group
+		} else {
+			// Pattern: <prefix><tmp>?
+			buf.append(tmp); // Char range (cannot have a '?')
+			if (prefixOnly) {
+				assert (!innerGroupOptional);
+				buf.append('?'); // prefixOnly = true!
+			}
+		}
+		LOG.trace("buildGroup sameLength case: {}", buf);
+		out.accept(buf);
+		return;
+	}
+	// At this point, at least one alternative is longer than a single character plus the <postfix>!
+	// We may need groups because:
+	// 1. prefixOnly == true
+	// 2. innerGroupOptional == true
+	// Pattern: <prefix>(?:<midfix>(?:<a>|<b>|...)<postfix>)?
+	// If midfix and postfix are empty, then we only need the inner group, even if prefixOnly
+	final boolean outerParentheses = prefixOnly && (midfixend != prefixend || !postfix.isEmpty());
+	buf.setLength(0);
+	buf.append(key, subbegin, prefixend);
+	if (outerParentheses) {
+		buf.append("(?:");
+	}
+	buf.append(key, prefixend, midfixend);
+	buf.append("(?:"); // inner
+
+	// Merge subsequent alternatives with a common postfix except the first char into char ranges:
+	for (int i = 0; i < cs.size(); i++) {
+		String wi = cs.get(i);
+		if (wi == null) {
+			continue;
+		}
+		assert (wi.length() > 0);
+		// Collect the letters if the postfix matches:
+		tmp.setLength(0);
+		tmp.append(wi.charAt(0));
+		for (int j = i + 1; j < cs.size(); j++) {
+			String wj = cs.get(j);
+			if (wj == null || wi.length() != wj.length()) {
+				continue;
+			}
+			char cj = wj.charAt(0);
+			if (isSimple(cj) && wi.regionMatches(1, wj, 1, wi.length() - 1)) {
+				tmp.append(cj);
+				cs.set(j, null);
+			}
+		}
+		// Separate alternatives
+		if (i > 0) {
+			buf.append('|');
+		}
+		if (tmp.length() > 1) {
+			mergeCharRanges(tmp, 0);
+			buf.append('[').append(tmp).append(']');
+			buf.append(wi, 1, wi.length() - postfix.length());
+		} else {
+			buf.append(wi, 0, wi.length() - postfix.length());
+		}
+	}
+	buf.append(')'); // Close inner group.
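+	// Append the remaining parts in opening order: the inner group's '?', the shared postfix, the outer ')', and the prefixOnly '?'.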
+	if (innerGroupOptional) {
+		buf.append('?');
+	}
+	buf.append(postfix);
+	if (outerParentheses) {
+		buf.append(')');
+	}
+	if (prefixOnly) {
+		assert (outerParentheses || !innerGroupOptional);
+		assert (buf.charAt(buf.length() - 1) == ')');
+		buf.append('?');
+	}
+	LOG.trace("buildGroup base case: {}", buf);
+	out.accept(buf);
+}
+
+/**
+ * Check if all strings have the same length.
+ *
+ * @param cs Collection of strings
+ * @param len Required length
+ * @return {@code true} if all have the same length.
+ */
+private static boolean sameLength(Collection<String> cs, int len) {
+	for (String s : cs) {
+		if (s.length() != len) {
+			return false;
+		}
+	}
+	return true;
+}
+
+/**
+ * Merge subsequent character ranges, if more than 2.
+ *
+ * E.g. convert "01234" -> "0-4"
+ *
+ * @param chars Character buffer
+ * @param start Starting position (set to 1 if you already have '[' in the buffer)
+ */
+private static void mergeCharRanges(StringBuilder chars, int start) {
+	// Build ranges:
+	for (int i = start; i < chars.length();) {
+		char c = chars.charAt(i);
+		int j = i + 1;
+		while (j < chars.length() && chars.charAt(j) == ++c && isSimple(chars.charAt(j))) {
+			++j;
+		}
+		if (j - i >= 3) {
+			chars.replace(i, j, chars.charAt(i) + "-" + chars.charAt(j - 1));
+			i += 2;
+		} else {
+			i = j;
+		}
+	}
+}
+
+private static String findPostfix(ArrayList<String> cs) {
+	final String first = cs.get(0);
+	final int num = cs.size();
+	int l = 1, p = first.length() - 1, good = 0, level = 0;
+	outer: while (p >= 0) {
+		char c = first.charAt(p);
+		char prev = p > 0 ? first.charAt(p - 1) : 'X';
+		for (int i = 1; i < num; i++) {
+			String cand = cs.get(i);
+			if (cand.length() < l || cand.charAt(cand.length() - l) != c) {
+				break outer;
+			}
+		}
+		if (prev != '\\' && (c == '[' || c == '(')) {
+			--level;
+		}
+		good = (level != 0 || !isSimple(c) || prev == '\\') ? good : l;
+		if (prev != '\\' && (c == ']' || c == ')')) {
+			++level;
+		}
+		if (prev == '\\' && c == '\\') {
+			break; // Too complex, there could be more.
+		}
+		++l;
+		--p;
+	}
+	return good > 0 ? first.substring(first.length() - good) : "";
+}
+
+private static int nextLength(String k, int p) throws OptimizerException {
+	int l = p;
+	assert (l < k.length()) : "Trying to access char " + l + " of: " + k;
+	char next = k.charAt(l);
+	if (next == '\\') {
+		if (l + 1 == k.length()) {
+			throw new OptimizerException("Trailing backslash? " + k);
+		}
+		++l;
+	}
+	++l;
+	while (l < k.length()) {
+		char next2 = k.charAt(l);
+		if (next2 == '?' || next2 == '*' || next2 == '+') {
+			++l;
+		} else {
+			break;
+		}
+	}
+	return l;
+}
+
+/**
+ * Find the length of a shared prefix.
+ *
+ * @param ps Data array
+ * @param start Subset begin
+ * @param end Subset end
+ * @param p Known prefix length
+ * @return New prefix length
+ */
+private static int prefixLength(String[] ps, int start, int end, int p) {
+	final String k = ps[start];
+	if (p == k.length()) {
+		return p;
+	}
+	int good = p;
+	int inset = 0;
+	char prev = p > 0 ? k.charAt(p - 1) : 'X';
+	char next = k.charAt(p);
+	common: while (p < k.length()) {
+		for (int i = start + 1; i < end; i++) {
+			String cand = ps[i];
+			if (cand.length() < p || cand.charAt(p) != next) {
+				break common;
+			}
+		}
+		if (prev == '\\') {
+			prev = 'X';
+		} else {
+			if (next == '[') {
+				++inset;
+			} else if (next == ']') {
+				--inset;
+			}
+			prev = next;
+		}
+		++p;
+		next = p < k.length() ? k.charAt(p) : 'X';
+		good = (inset > 0 || prev == '\\' || next == '?' || next == '*' || next == '+') ? good : p;
+	}
+	return good;
+}
+
+public static void main(String[] args) {
+	try {
+		String[] test = { //
+				// "1(?:st|\\.)? Advent", "first Advent", //
+				// "2(?:nd|\\.)? Advent", "second Advent", //
+				// "3(?:rd|\\.)? Advent", "third Advent", //
+				// "4(?:th|\\.)? Advent", "fourth Advent", //
+				// "Christmas(?: [Ee]ve| [Dd]ay)?", "Calennig", //
+				// "X-?(?:mas|MAS)", //
+				// "[0-9][0-9]?[0-9]?[0-9]?", // produces duplicates!
+				// "[Ss]ix", "[Ss]ixty", "[Ss]ixteen", //
+				// "[Ss]ixty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)", //
+				// "1[0-9]", "1[0-9]th", //
+				// "[Hh]eilig(?:en?|) [Dd]rei König(?:en?|)", "[Hh]eilig(?:en|) Abend", //
+				"(?:[Aa](?:pril(?:is)?|ugusti?)|[Dd]e(?:cemb(?:er|r(?:\\.|is)?)|zember)|[Ff]ebruar(?:ii|y)?|[Hh]ornung|[Jj](?:anuar(?:ii|y)?|u(?:[ln](?:ii?|y))|änner)|[Mm](?:a(?:erz|ii?|r(?:ch|t(?:ii)?)|y)|[eä]rz)|[Nn]ovemb(?:er|r(?:\\.|is)?)|[Oo](?:ctob(?:er|r(?:\\.|is)?)|ktober)|[Ss]eptemb(?:er|r(?:\\.|is)?))", //
+				"(?:[Aa](?:pr(?:\\.)?|ug(?:\\.)?)|[Dd]e(?:[cz](?:\\.)?)|[Ff]eb(?:\\.)?|[Jj](?:an(?:\\.)?|u(?:[ln](?:\\.)?))|[Mm](?:a(?:[iry])|är(?:\\.)?)|[Nn]ov(?:\\.)?|[Oo][ck]t(?:\\.)?|[Ss]ep(?:\\.|t(?:\\.)?)?)", //
+				"(?:0[1-9]|1[0-2]?|[2-9])", //
+		};
+
+		ArrayList<String> expanded = new ArrayList<>();
+		for (String s : test) {
+			expandPatterns(s, x -> expanded.add(x.toString()));
+		}
+		// Note: this may still contain duplicates!
+		Collections.sort(expanded, upperLowerChar);
+		for (String s : expanded.subList(0, Math.min(expanded.size(), 100))) {
+			System.out.println(s);
+		}
+		if (expanded.size() >= 100) {
+			System.out.println("... and " + (expanded.size() - 100) + " more.");
+		}
+		System.out.println("---> converted to --->");
+		String combined = combinePatterns(expanded);
+		System.out.println(combined);
+	} catch (OptimizerException e) {
+		LOG.error(e.getMessage(), e);
+	}
+}
+}
\ No newline at end of file
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/Season.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/Season.java
new file mode 100644
index 00000000..df4386f6
--- /dev/null
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/Season.java
@@ -0,0 +1,49 @@
+package de.unihd.dbs.uima.annotator.heideltime.utilities;
+
+public enum Season {
+	SPRING("SP", 0), SUMMER("SU", 1), FALL("FA", 2), WINTER("WI", 3);
+	protected String str;
+	protected int off;
+
+	private Season(String s, int o) {
+		str = s;
+		off = o;
+	}
+
+	@Override
+	public String toString() {
+		return str;
+	}
+
+	/**
+	 * Ordinal representation, spring = 0, summer = 1, fall = 2, winter = 3
+	 *
+	 * @return Ordinal
+	 */
+	public int ord() {
+		return off;
+	}
+
+	public static Season of(CharSequence s, int b) {
+		if (b + 1 >= s.length())
+			return null;
+		char c1 = s.charAt(b), c2 = s.charAt(b + 1);
+		if (c1 == 'S') {
+			if (c2 == 'P')
+				return SPRING;
+			if (c2 == 'U')
+				return SUMMER;
+		} else if (c1 == 'F') {
+			if (c2 == 'A')
+				return FALL;
+		} else if (c1 == 'W') {
+			if (c2 == 'I')
+				return WINTER;
+		}
+		return null;
+	}
+
+	public static Season of(CharSequence s) {
+		return of(s, 0);
+	}
+}
\ No newline at end of file
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/TokenBoundaryMatcher.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/TokenBoundaryMatcher.java
new file mode 100644
index 00000000..e2e87f5d
--- /dev/null
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/TokenBoundaryMatcher.java
@@ -0,0 +1,310 @@
+package de.unihd.dbs.uima.annotator.heideltime.utilities;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.jcas.JCas;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.unihd.dbs.uima.types.heideltime.Sentence;
+import de.unihd.dbs.uima.types.heideltime.Token;
+
+/**
+ * Class for more efficient matching.
+ *
+ * This class applies a regular expression only at known token boundary positions, rather than attempting to match at any character.
+ *
+ * @author Erich Schubert
+ */
+public class TokenBoundaryMatcher {
+	/** Class logger */
+	private static final Logger LOG = LoggerFactory.getLogger(TokenBoundaryMatcher.class);
+
+	/** Enable pattern profiling, to identify unusually slow patterns */
+	private static final boolean PROFILE_REGEXP = false;
+
+	/** Data storage for profiling */
+	private HashMap<String, Long> profileData = PROFILE_REGEXP ? new HashMap<>() : null;
+
+	/** Storage for valid starting positions */
+	IntArrayList startpos = new IntArrayList();
+
+	/** Storage for valid end positions */
+	IntArrayList endpos = new IntArrayList();
+
+	/**
+	 * Simplify a string by doing some character substitutions.
+	 *
+	 * This intentionally does not change the length, to preserve offsets!
+	 *
+	 * @param in Input text
+	 * @return Simplified text
+	 */
+	public static CharSequence simplifyString(CharSequence in) {
+		StringBuilder buf = new StringBuilder(in);
+		final int len = in.length();
+		for (int i = 0; i < len; i++) {
+			char c = buf.charAt(i);
+			if (c == '\t' || c == '\u00A0' || (c >= '\u2000' && c <= '\u200D') || c == '\u202F' || c == '\u205F' || c == '\u2060' || c == '\u3000' || c == '\uFEFF') {
+				// Normalize whitespace (but leave \n\r)
+				buf.setCharAt(i, ' ');
+			} else if (c >= '\u2011' && c <= '\u2014') {
+				// Normalize unicode hyphens:
+				buf.setCharAt(i, '-');
+			} else if (c == '\u000B' || c == '\u000C' || c == '\u0085' || c == '\u2028' || c == '\u2029') {
+				// Unusual line and paragraph breaks.
+				buf.setCharAt(i, '\n');
+			}
+			// TODO: add double-width arabic digits? But they are 2 chars wide.
+		}
+		return buf;
+	}
+
+	/**
+	 * Build a list of admissible token boundaries, for faster matching.
+	 *
+	 * If a token begins or ends with [.,/-], this character will be an optional match.
+	 * Valid start and end positions are stored in the {@code startpos} and {@code endpos} fields.
+	 *
+	 * @param coveredText Text covered by sentence
+	 * @param s Sentence
+	 * @param jcas JCas
+	 */
+	public void tokenBoundaries(CharSequence coveredText, Sentence s, JCas jcas) {
+		startpos.clear();
+		endpos.clear();
+		AnnotationIndex<Token> tokens = jcas.getAnnotationIndex(Token.type);
+		final int offset = s.getBegin();
+		startpos.add(0); // s.getBegin() - offset
+		for (FSIterator<Token> iterToken = tokens.subiterator(s); iterToken.hasNext();) {
+			Token t = iterToken.next();
+			int begin = t.getBegin() - offset, end = t.getEnd() - offset;
+			if (begin == end)
+				continue;
+			// Allow begin and end for anchoring:
+			if (checkBegin(coveredText, begin))
+				startpos.add(begin);
+			// Note, begin < end!
+			char first = coveredText.charAt(begin);
+			if (first == '.' || first == ',' || first == '/' || first == '-')
+				if (checkBegin(coveredText, begin + 1))
+					startpos.add(begin + 1);
+			int lastp = end - 1;
+			if (begin < lastp) { // Avoid checking one-char tokens twice.
+				char lastc = coveredText.charAt(lastp);
+				if (lastc == '.' || lastc == ',' || lastc == '/' || lastc == '-')
+					if (checkEnd(coveredText, lastp))
+						endpos.add(lastp);
+				// Stanford produces tokens like "2016/2017".
+				if (isDigit(first) && isDigit(lastc)) {
+					int left = begin + 1, right = lastp - 1;
+					while (left < right && isDigit(coveredText.charAt(left)))
+						++left;
+					while (left < right && isDigit(coveredText.charAt(right)))
+						--right;
+					if (left == right) {
+						char sep = coveredText.charAt(left);
+						if (sep == '/' /* || sep == '-' || sep == '.' || sep == ',' */) {
+							endpos.add(left);
+							startpos.add(left + 1);
+						}
+					}
+				}
+			}
+			if (checkEnd(coveredText, end))
+				endpos.add(end);
+		}
+		endpos.add(s.getEnd() - offset);
+		startpos.sortRemoveDuplicates();
+		endpos.sortRemoveDuplicates();
+		if (LOG.isTraceEnabled())
+			LOG.trace("Token boundaries: {}", debugTokenBoundaries(coveredText));
+	}
+
+	/**
+	 * Produce a debug representation of the token boundaries, using the stored start and end positions.
+	 *
+	 * @param cov Covered text
+	 * @return String buffer
+	 */
+	public StringBuilder debugTokenBoundaries(CharSequence cov) {
+		final int l = cov.length(), sl = startpos.size(), el = endpos.size();
+		assert (endpos.size() == 0 || endpos.get(0) > 0);
+		StringBuilder buf = new StringBuilder(l + sl + el);
+		for (int c = 0, s = 0, e = 0; c < l; c++) {
+			if (s < sl && c == startpos.get(s)) {
+				buf.append('»');
+				++s;
+			}
+			buf.append(cov.charAt(c));
+			if (e < el && c + 1 == endpos.get(e)) {
+				buf.append('«');
+				++e;
+			}
+		}
+		return buf;
+	}
+
+	/**
+	 * Check the beginning of a token. Disallow 0123.2016 to match 2016.
+	 *
+	 * @param cov Text
+	 * @param begin Position
+	 * @return {@code false} if a bad position
+	 */
+	private static boolean checkBegin(CharSequence cov, int begin) {
+		final int len = cov.length();
+		if (begin >= len)
+			return false;
+		// Position 0 is always allowed.
+		if (begin == 0)
+			return true;
+		// Check digits:
+		final char curr = cov.charAt(begin);
+		if (isDigit(curr)) {
+			// Check previous character:
+			final char prev = cov.charAt(begin - 1);
+			// get rid of expressions if there is a character or symbol ($+) directly in front of the expression
+			if (prev == '$' || prev == '€' || prev == '+' || Character.isAlphabetic(prev))
+				return false;
+			if (begin == 1)
+				return true;
+			// Check two characters: [0-9]\.
+			if (prev == '.' && isDigit(cov.charAt(begin - 2)))
+				return false;
+		} else if (curr == '.' || curr == ',') {
+			if (begin + 1 < len && isDigit(cov.charAt(begin - 1)) && isDigit(cov.charAt(begin + 1)))
+				return false; // Looks like a decimal point
+		}
+		return true;
+	}
+
+	/**
+	 * Check the end position of a token. Disallow 2016.12345 to match 2016.
+	 *
+	 * @param cov Text
+	 * @param end Position
+	 * @return {@code false} if a bad position
+	 */
+	private static boolean checkEnd(CharSequence cov, int end) {
+		final int len = cov.length();
+		if (end == 0 || end > len)
+			return false; // would be empty, or invalid
+		// End is always allowed.
+		if (end == len)
+			return true;
+		// Current character:
+		final char curr = cov.charAt(end - 1);
+		if (isDigit(curr)) {
+			final char succ = cov.charAt(end);
+			if (succ == '%' || succ == '(' || succ == '°' || succ == '$' || succ == '€' || Character.isAlphabetic(succ))
+				return false;
+			if (end + 1 < len && (succ == '.' || succ == ','))
+				if (isDigit(cov.charAt(end + 1)))
+					return false; // Looks like a decimal point
+		}
+		return true;
+	}
+
+	/**
+	 * In contrast to {@link Character#isDigit}, we only allow ascii digits here.
+	 *
+	 * (The other digits require two chars!)
+	 *
+	 * @param c Character
+	 * @return {@code true} if '0'-'9'
+	 */
+	public static boolean isDigit(char c) {
+		return (c >= '0' && c <= '9');
+	}
+
+	/**
+	 * Find the next match beginning and ending at token boundaries.
+	 *
+	 * @param start Search position
+	 * @param m Matcher
+	 * @param key Key (for performance logging)
+	 * @return Search position to continue, or {@code -1} if no match.
+	 */
+	public int matchNext(int start, Matcher m, String key) {
+		assert (start >= 0);
+		long begin = PROFILE_REGEXP ? System.nanoTime() : 0;
+		final int slen = startpos.size();
+		if (start >= slen || endpos.size() == 0)
+			return -1;
+		final int epos = endpos.get(endpos.size() - 1);
+		while (start < slen) {
+			if (m.region(startpos.get(start), epos).lookingAt()) {
+				// Match. Ensure end matched a token boundary:
+				final int mend = m.end();
+				if (endpos.binarySearch(mend) >= 0) {
+					// Good match.
+					// Find position to continue matching:
+					int etok = start + 1;
+					while (etok < slen && startpos.get(etok) < mend)
+						++etok;
+					if (PROFILE_REGEXP) {
+						long dur = System.nanoTime() - begin;
+						Long old = profileData.get(key);
+						profileData.put(key, dur + (old != null ? old.longValue() : 0L));
+					}
+					return etok;
+				}
+			}
+			++start;
+		}
+		if (PROFILE_REGEXP) {
+			long dur = System.nanoTime() - begin;
+			Long old = profileData.get(key);
+			profileData.put(key, dur + (old != null ? old.longValue() : 0L));
+		}
+		return -1; // No match
+	}
+
+	/**
+	 * Output profiling data, if enabled.
+	 */
+	public void logProfilingData() {
+		if (!PROFILE_REGEXP)
+			return;
+		long sum = 0;
+		for (Long v : profileData.values())
+			sum += v;
+		double avg = sum / (double) profileData.size();
+
+		StringBuilder buf = new StringBuilder();
+		buf.append("Profiling data:\n");
+		buf.append("Average: ").append(avg).append("\n");
+		buf.append("Rules with above average cost:\n");
+		for (Map.Entry<String, Long> ent : profileData.entrySet()) {
+			long v = ent.getValue();
+			if (v > 2 * avg)
+				buf.append(v).append('\t').append(ent.getKey()).append('\t').append(v / avg).append("\n");
+		}
+		LOG.warn(buf.toString());
+	}
+}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/Toolbox.java b/src/de/unihd/dbs/uima/annotator/heideltime/utilities/Toolbox.java
deleted file mode 100644
index 96a60122..00000000
--- a/src/de/unihd/dbs/uima/annotator/heideltime/utilities/Toolbox.java
+++ /dev/null
@@ -1,61 +0,0 @@
-package de.unihd.dbs.uima.annotator.heideltime.utilities;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.regex.MatchResult;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-/**
- *
- * The Toolbox class contains methods with functionality that you would also
- * find outside the context of HeidelTime's specific skillset; i.e. they do
- * not require the CAS context, but are 'useful code snippets'.
- * @author jannik stroetgen
- *
- */
-public class Toolbox {
-	/**
-	 * Find all the matches of a pattern in a charSequence and return the
-	 * results as list.
- * - * @param pattern Pattern to be matched - * @param s String to be matched against - * @return Iterable List of MatchResults - */ - public static Iterable findMatches(Pattern pattern, CharSequence s) { - List results = new ArrayList(); - - for (Matcher m = pattern.matcher(s); m.find();) - results.add(m.toMatchResult()); - - return results; - } - - /** - * Sorts a given HashMap using a custom function - * @param m Map of items to sort - * @return sorted List of items - */ - public static List sortByValue(final HashMap m) { - List keys = new ArrayList(); - keys.addAll(m.keySet()); - Collections.sort(keys, new Comparator() { - @SuppressWarnings({ "unchecked", "rawtypes" }) - public int compare(Object o1, Object o2) { - Object v1 = m.get(o1); - Object v2 = m.get(o2); - if (v1 == null) { - return (v2 == null) ? 0 : 1; - } else if (v1 instanceof Comparable) { - return ((Comparable) v1).compareTo(v2); - } else { - return 0; - } - } - }); - return keys; - } -} diff --git a/src/de/unihd/dbs/uima/annotator/intervaltagger/IntervalTagger.java b/src/de/unihd/dbs/uima/annotator/intervaltagger/IntervalTagger.java index c806e6ca..3aa32de3 100644 --- a/src/de/unihd/dbs/uima/annotator/intervaltagger/IntervalTagger.java +++ b/src/de/unihd/dbs/uima/annotator/intervaltagger/IntervalTagger.java @@ -9,23 +9,28 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.regex.MatchResult; +import java.util.Locale; +import java.util.Map; +import java.util.TimeZone; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; import de.unihd.dbs.uima.annotator.heideltime.resources.RePatternManager; import de.unihd.dbs.uima.annotator.heideltime.resources.ResourceMap; import de.unihd.dbs.uima.annotator.heideltime.resources.ResourceScanner; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox; +import de.unihd.dbs.uima.annotator.heideltime.resources.RuleManager; import de.unihd.dbs.uima.types.heideltime.IntervalCandidateSentence; import de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Timex3; @@ -37,9 +42,10 @@ * */ public class IntervalTagger extends JCasAnnotator_ImplBase { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(IntervalTagger.class); - // TOOL NAME (may be used as componentId) - private Class component = this.getClass(); + private static final TimeZone GMT = TimeZone.getTimeZone("GMT"); // descriptor parameter names public static String PARAM_LANGUAGE = "language"; @@ -47,8 +53,8 @@ public class IntervalTagger extends JCasAnnotator_ImplBase { public static String PARAM_INTERVAL_CANDIDATES = "annotate_interval_candidates"; // descriptor configuration private Language language = null; - private Boolean find_intervals = true; - private Boolean find_interval_candidates = true; + private boolean find_intervals = true; + private boolean find_interval_candidates = true; private HashMap 
hmIntervalPattern = new HashMap(); private HashMap hmIntervalNormalization = new HashMap(); @@ -83,89 +89,60 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { * @throws ResourceInitializationException */ private void readResources(ResourceMap hmResourcesRules) throws ResourceInitializationException { - Pattern paReadRules = Pattern.compile("RULENAME=\"(.*?)\",EXTRACTION=\"(.*?)\",NORM_VALUE=\"(.*?)\"(.*)"); - Pattern paVariable = Pattern.compile("%(re[a-zA-Z0-9]*)"); + Matcher maReadRules = Pattern.compile("RULENAME=\"(.*?)\",EXTRACTION=\"(.*?)\",NORM_VALUE=\"(.*?)\"(.*)").matcher(""); // read normalization data - InputStream is = null; - InputStreamReader isr = null; - BufferedReader br = null; - try { - for (String resource : hmResourcesRules.keySet()) { - is = hmResourcesRules.getInputStream(resource); - isr = new InputStreamReader(is, "UTF-8"); - br = new BufferedReader(isr); - Logger.printDetail(component, "Adding rule resource: " + resource); + for (String resource : hmResourcesRules.keySet()) { + if(!resource.equals("intervalrules")) + continue; + try (InputStream is = hmResourcesRules.getInputStream(resource);// + InputStreamReader isr = new InputStreamReader(is, "UTF-8");// + BufferedReader br = new BufferedReader(isr)) { + LOG.debug("Adding rule resource: {}", resource); for(String line; (line = br.readLine()) != null; ) { - if(line.startsWith("//") || line.equals("")) { + if(line.startsWith("//") || line.equals("")) continue; - } - Logger.printDetail("DEBUGGING: reading rules..."+ line); + LOG.debug("reading rules... {}", line); // check each line for the name, extraction, and normalization part - for (MatchResult r : Toolbox.findMatches(paReadRules, line)) { - String rule_name = r.group(1); - String rule_extraction = r.group(2); - String rule_normalization = r.group(3); + for (maReadRules.reset(line); maReadRules.find(); ) { + String rule_name = maReadRules.group(1); + String rule_extraction = maReadRules.group(2); + String rule_normalization = maReadRules.group(3); //////////////////////////////////////////////////////////////////// // RULE EXTRACTION PARTS ARE TRANSLATED INTO REGULAR EXPRESSSIONS // //////////////////////////////////////////////////////////////////// // create pattern for rule extraction part RePatternManager rpm = RePatternManager.getInstance(language, false); - for (MatchResult mr : Toolbox.findMatches(paVariable,rule_extraction)) { - Logger.printDetail("DEBUGGING: replacing patterns..."+ mr.group()); - if (!(rpm.containsKey(mr.group(1)))) { - Logger.printError("Error creating rule:"+rule_name); - Logger.printError("The following pattern used in this rule does not exist, does it? 
%"+mr.group(1)); - System.exit(-1); - } - rule_extraction = rule_extraction.replaceAll("%"+mr.group(1), rpm.get(mr.group(1))); - } - rule_extraction = rule_extraction.replaceAll(" ", "[\\\\s]+"); + rule_extraction = RuleManager.expandVariables(rule_name, rule_extraction, rpm); + rule_extraction = RuleManager.replaceSpaces(rule_extraction); Pattern pattern = null; try{ pattern = Pattern.compile(rule_extraction); } - catch (java.util.regex.PatternSyntaxException e) { - Logger.printError("Compiling rules resulted in errors."); - Logger.printError("Problematic rule is "+rule_name); - Logger.printError("Cannot compile pattern: "+rule_extraction); - e.printStackTrace(); - System.exit(-1); + catch (PatternSyntaxException e) { + LOG.error("Compiling rules resulted in errors.", e); + LOG.error("Problematic rule is: {}\nCannot compile pattern: {}", rule_name, rule_extraction); + System.exit(1); } ///////////////////////////////////////////////// // READ INTERVAL RULES AND MAKE THEM AVAILABLE // ///////////////////////////////////////////////// - if(resource.equals("intervalrules")){ - hmIntervalPattern.put(pattern,rule_name); - hmIntervalNormalization.put(rule_name, rule_normalization); - } + hmIntervalPattern.put(pattern,rule_name); + hmIntervalNormalization.put(rule_name, rule_normalization); } } - } - } catch (IOException e) { - e.printStackTrace(); - throw new ResourceInitializationException(); - } finally { - try { - if(br != null) { - br.close(); - } - if(isr != null) { - isr.close(); - } - if(is != null) { - is.close(); - } - } catch(Exception e) { - e.printStackTrace(); + } catch (IOException e) { + LOG.error(e.getMessage(), e); + throw new ResourceInitializationException(); } } } - + Pattern pNorm=Pattern.compile("group\\(([1-9]+)\\)-group\\(([1-9]+)\\)"); + /** * Extract Timex3Intervals, delimited by two Timex3Intervals in a sentence. * finsInterval needs to be run with jcas before. @@ -175,141 +152,134 @@ private void readResources(ResourceMap hmResourcesRules) throws ResourceInitiali private void findSentenceIntervals(JCas jcas){ HashSet timexesToRemove = new HashSet(); - FSIterator iterSentence = jcas.getAnnotationIndex(Sentence.type).iterator(); - while (iterSentence.hasNext()) { - Sentence s=(Sentence)iterSentence.next(); + AnnotationIndex sentences = jcas.getAnnotationIndex(Sentence.type); + for(Sentence s : sentences) { String sString=s.getCoveredText(); - FSIterator iterInter = jcas.getAnnotationIndex(Timex3Interval.type).subiterator(s); + AnnotationIndex intervals = jcas.getAnnotationIndex(Timex3Interval.type); + int count=0; List txes=new ArrayList(); - List sentenceTxes=new ArrayList(); - - while(iterInter.hasNext()){ + + for(FSIterator iterInter = intervals.subiterator(s); iterInter.hasNext(); ){ Timex3Interval t=(Timex3Interval)iterInter.next(); sString=sString.replace(t.getCoveredText(), ""); count++; txes.add(t); } - if(count>0){ + if(count == 0) + continue; - if (find_interval_candidates){ - IntervalCandidateSentence sI=new IntervalCandidateSentence(jcas); - sI.setBegin(s.getBegin()); - sI.setEnd(s.getEnd()); - sI.addToIndexes(); - } - for(Pattern p: hmIntervalPattern.keySet()){ - - String name=hmIntervalPattern.get(p); - Listresults=(List)Toolbox.findMatches(p,sString); - if(results.size()>0){ - //Interval in Sentence s found by Pattern p! 
- for(MatchResult r: results){ - Pattern pNorm=Pattern.compile("group\\(([1-9]+)\\)-group\\(([1-9]+)\\)"); - String norm=hmIntervalNormalization.get(name); - - Matcher mNorm=pNorm.matcher(norm); - if(!mNorm.matches()){ - System.err.println("Problem with the Norm in rule "+name); - } - Timex3Interval startTx=null,endTx=null; - try{ - int startId=Integer.parseInt(mNorm.group(1)); - int endId=Integer.parseInt(mNorm.group(2)); - - startTx=txes.get(Integer.parseInt(r.group(startId))); - endTx=txes.get(Integer.parseInt(r.group(endId))); - }catch(Exception e){ - e.printStackTrace(); - return; - } - Timex3Interval annotation=new Timex3Interval(jcas); - annotation.setBegin(startTx.getBegin()>endTx.getBegin()?endTx.getBegin():startTx.getBegin()); - annotation.setEnd(startTx.getEnd()>endTx.getEnd()?startTx.getEnd():endTx.getEnd()); - - //Does the interval already exist, - //found by another pattern? - boolean duplicate=false; - for(Timex3Interval tx:sentenceTxes){ - if(tx.getBegin()==annotation.getBegin() && - tx.getEnd()==annotation.getEnd()){ - duplicate=true; - break; - } - } + if (find_interval_candidates){ + IntervalCandidateSentence sI=new IntervalCandidateSentence(jcas); + sI.setBegin(s.getBegin()); + sI.setEnd(s.getEnd()); + sI.addToIndexes(); + } + List sentenceTxes=new ArrayList(); + for(Map.Entry ent : hmIntervalPattern.entrySet()){ + String name=ent.getValue(); + Matcher m = ent.getKey().matcher(sString); + while (m.find()) { + String norm=hmIntervalNormalization.get(name); - if(!duplicate){ - annotation.setTimexValueEB(startTx.getTimexValueEB()); - annotation.setTimexValueLB(startTx.getTimexValueLE()); - annotation.setTimexValueEE(endTx.getTimexValueEB()); - annotation.setTimexValueLE(endTx.getTimexValueLE()); - annotation.setTimexType(startTx.getTimexType()); - annotation.setFoundByRule(name); - - - // create emptyvalue value - String emptyValue = createEmptyValue(startTx, endTx, jcas); - annotation.setEmptyValue(emptyValue); - annotation.setBeginTimex(startTx.getBeginTimex()); - annotation.setEndTimex(endTx.getEndTimex()); - - try { - sentenceTxes.add(annotation); - } catch(NumberFormatException e) { - Logger.printError(component, "Couldn't do emptyValue calculation on accont of a faulty normalization in " - + annotation.getTimexValueEB() + " or " + annotation.getTimexValueEE()); - } - - // prepare tx3intervals to remove - timexesToRemove.add(startTx); - timexesToRemove.add(endTx); - - annotation.addToIndexes(); -// System.out.println(emptyValue); - } + Matcher mNorm=pNorm.matcher(norm); + if(!mNorm.matches()){ + LOG.warn("Problem with the Norm in rule "+name); + continue; + } + Timex3Interval startTx=null,endTx=null; + try{ + int startId=Integer.parseInt(mNorm.group(1)); + int endId=Integer.parseInt(mNorm.group(2)); + + startTx=txes.get(Integer.parseInt(m.group(startId))); + endTx=txes.get(Integer.parseInt(m.group(endId))); + }catch(Exception e){ + LOG.error(e.getMessage(), e); + return; + } + Timex3Interval annotation=new Timex3Interval(jcas); + annotation.setBegin(startTx.getBegin()>endTx.getBegin()?endTx.getBegin():startTx.getBegin()); + annotation.setEnd(startTx.getEnd()>endTx.getEnd()?startTx.getEnd():endTx.getEnd()); + + //Does the interval already exist, + //found by another pattern? 
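+						// (Linear scan over this sentence's intervals; duplicates arise when several rules match the same pair.)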
+					boolean duplicate=false;
+					for(Timex3Interval tx:sentenceTxes)
+						if(tx.getBegin()==annotation.getBegin() &&
+								tx.getEnd()==annotation.getEnd()){
+							duplicate=true;
+							break;
+						}
+					if(duplicate)
+						continue;
+
+					annotation.setTimexValueEB(startTx.getTimexValueEB());
+					annotation.setTimexValueLB(startTx.getTimexValueLE());
+					annotation.setTimexValueEE(endTx.getTimexValueEB());
+					annotation.setTimexValueLE(endTx.getTimexValueLE());
+					annotation.setTimexType(startTx.getTimexType());
+					annotation.setFoundByRule(name);
+
+					// create emptyvalue value
+					String emptyValue = createEmptyValue(startTx, endTx, jcas);
+					annotation.setEmptyValue(emptyValue);
+					annotation.setBeginTimex(startTx.getBeginTimex());
+					annotation.setEndTimex(endTx.getEndTimex());
+
+					try {
+						sentenceTxes.add(annotation);
+					} catch(NumberFormatException e) {
+						LOG.error("Couldn't do emptyValue calculation on account of a faulty normalization in {} or {}",
+								annotation.getTimexValueEB(), annotation.getTimexValueEE());
+					}
+
+					// prepare tx3intervals to remove
+					timexesToRemove.add(startTx);
+					timexesToRemove.add(endTx);
+
+					annotation.addToIndexes();
+				}
+			}
+		}
-		for(Timex3Interval txi : timexesToRemove) {
+		for(Timex3Interval txi : timexesToRemove)
			txi.removeFromIndexes();
-		}
	}

+	Pattern datep = Pattern.compile("(\\d{1,4})?-?(\\d{2})?-?(\\d{2})?(T)?(\\d{2})?:?(\\d{2})?:?(\\d{2})?");
+	//                               1            2          3        4   5          6          7
+
	private String createEmptyValue(Timex3Interval startTx, Timex3Interval endTx, JCas jcas) throws NumberFormatException {
		String dateStr = "", timeStr = "";

		// find granularity for start/end timex values
-		Pattern p = Pattern.compile("(\\d{1,4})?-?(\\d{2})?-?(\\d{2})?(T)?(\\d{2})?:?(\\d{2})?:?(\\d{2})?");
-		//                           1            2          3        4   5          6          7
-		Matcher mStart = p.matcher(startTx.getTimexValue());
-		Matcher mEnd = p.matcher(endTx.getTimexValue());
-		Integer granularityStart = -1;
-		Integer granularityEnd = -2;
-		Integer granularity = -1;
+		Matcher mStart = datep.matcher(startTx.getTimexValue());
+		Matcher mEnd = datep.matcher(endTx.getTimexValue());
+
+		if(!mStart.find() || !mEnd.find())
+			return "";

		// find the highest granularity in each timex
-		if(mStart.find() && mEnd.find()) {
-			for(Integer i = 1; i <= mStart.groupCount(); i++) {
-				if(mStart.group(i) != null)
-					granularityStart = i;
-				if(mEnd.group(i) != null)
-					granularityEnd = i;
-			}
-		}
+		int granularityStart = -1;
+		for(int i = 1; i <= mStart.groupCount(); i++)
+			if(mStart.group(i) != null)
+				granularityStart = i;
+		int granularityEnd = -2;
+		for(int i = 1; i <= mEnd.groupCount(); i++)
+			if(mEnd.group(i) != null)
+				granularityEnd = i;

		// if granularities aren't the same, we can't do anything here.
- if(granularityEnd != granularityStart) { + if(granularityEnd != granularityStart) return ""; - } else { // otherwise, set maximum granularity - granularity = granularityStart; - } + + // otherwise, set maximum granularity + int granularity = granularityStart; // check all the different granularities, starting with seconds, calculate differences, add carries - Integer myYears = 0, + int myYears = 0, myMonths = 0, myDays = 0, myHours = 0, @@ -343,7 +313,7 @@ private String createEmptyValue(Timex3Interval startTx, Timex3Interval endTx, JC if(granularity >= 3 && mStart.group(3) != null && mEnd.group(3) != null) { myDays += Integer.parseInt(mEnd.group(3)) - Integer.parseInt(mStart.group(3)); if(myDays < 0) { - Calendar cal = Calendar.getInstance(); + Calendar cal = Calendar.getInstance(GMT, Locale.ROOT); cal.set(Calendar.YEAR, Integer.parseInt(mStart.group(1))); cal.set(Calendar.MONTH, Integer.parseInt(mStart.group(2))); @@ -402,6 +372,17 @@ private String createEmptyValue(Timex3Interval startTx, Timex3Interval endTx, JC return "P" + dateStr + (timeStr.length() > 0 ? "T" + timeStr : ""); } + //DATE Pattern + Pattern pDate = Pattern.compile("(?:BC)?(\\d\\d\\d\\d)(-(\\d+))?(-(\\d+))?(T(\\d+))?(:(\\d+))?(:(\\d+))?"); + Pattern pCentury = Pattern.compile("(\\d\\d)"); + Pattern pDecade = Pattern.compile("(\\d\\d\\d)"); + Pattern pQuarter = Pattern.compile("(\\d+)-Q([1-4])"); + Pattern pHalf = Pattern.compile("(\\d+)-H([1-2])"); + Pattern pSeason = Pattern.compile("(\\d+)-(SP|SU|FA|WI)"); + Pattern pWeek = Pattern.compile("(\\d+)-W(\\d+)"); + Pattern pWeekend = Pattern.compile("(\\d+)-W(\\d+)-WE"); + Pattern pTimeOfDay = Pattern.compile("(\\d+)-(\\d+)-(\\d+)T(AF|DT|MI|MO|EV|NI)"); + /** * Build Timex3Interval-Annotations out of Timex3Annotations in jcas. * @author Manuel Dewald @@ -410,63 +391,29 @@ private String createEmptyValue(Timex3Interval startTx, Timex3Interval endTx, JC private void findIntervals(JCas jcas) { ArrayList newAnnotations = new ArrayList(); - FSIterator iterTimex3 = jcas.getAnnotationIndex(Timex3.type).iterator(); - while (iterTimex3.hasNext()) { + Matcher mDate = pDate.matcher(""); + Matcher mCentury= pCentury.matcher(""); + Matcher mDecade = pDecade.matcher(""); + Matcher mQuarter= pQuarter.matcher(""); + Matcher mHalf = pHalf.matcher(""); + Matcher mSeason = pSeason.matcher(""); + Matcher mWeek = pWeek.matcher(""); + Matcher mWeekend= pWeekend.matcher(""); + Matcher mTimeOfDay= pTimeOfDay.matcher(""); + + AnnotationIndex timexes = jcas.getAnnotationIndex(Timex3.type); + for (Timex3 timex3 : timexes) { Timex3Interval annotation=new Timex3Interval(jcas); - Timex3 timex3 = (Timex3) iterTimex3.next(); - - //DATE Pattern - Pattern pDate = Pattern.compile("(?:BC)?(\\d\\d\\d\\d)(-(\\d+))?(-(\\d+))?(T(\\d+))?(:(\\d+))?(:(\\d+))?"); - Pattern pCentury = Pattern.compile("(\\d\\d)"); - Pattern pDecate = Pattern.compile("(\\d\\d\\d)"); - Pattern pQuarter = Pattern.compile("(\\d+)-Q([1-4])"); - Pattern pHalf = Pattern.compile("(\\d+)-H([1-2])"); - Pattern pSeason = Pattern.compile("(\\d+)-(SP|SU|FA|WI)"); - Pattern pWeek = Pattern.compile("(\\d+)-W(\\d+)"); - Pattern pWeekend = Pattern.compile("(\\d+)-W(\\d+)-WE"); - Pattern pTimeOfDay = Pattern.compile("(\\d+)-(\\d+)-(\\d+)T(AF|DT|MI|MO|EV|NI)"); - - Matcher mDate = pDate.matcher(timex3.getTimexValue()); - Matcher mCentury= pCentury.matcher(timex3.getTimexValue()); - Matcher mDecade = pDecate.matcher(timex3.getTimexValue()); - Matcher mQuarter= pQuarter.matcher(timex3.getTimexValue()); - Matcher mHalf = 
pHalf.matcher(timex3.getTimexValue()); - Matcher mSeason = pSeason.matcher(timex3.getTimexValue()); - Matcher mWeek = pWeek.matcher(timex3.getTimexValue()); - Matcher mWeekend= pWeekend.matcher(timex3.getTimexValue()); - Matcher mTimeOfDay= pTimeOfDay.matcher(timex3.getTimexValue()); + String timexValue = timex3.getTimexValue(); - boolean matchesDate=mDate.matches(); - boolean matchesCentury=mCentury.matches(); - boolean matchesDecade=mDecade.matches(); - boolean matchesQuarter=mQuarter.matches(); - boolean matchesHalf=mHalf.matches(); - boolean matchesSeason=mSeason.matches(); - boolean matchesWeek=mWeek.matches(); - boolean matchesWeekend=mWeekend.matches(); - boolean matchesTimeOfDay=mTimeOfDay.matches(); + String beginYear="UNDEF", endYear="UNDEF"; + String beginMonth="01", endMonth="12"; + String beginDay="01", endDay="31"; + String beginHour="00", endHour="23"; + String beginMinute="00", endMinute="59"; + String beginSecond="00", endSecond="59"; - String beginYear, endYear; - String beginMonth, endMonth; - String beginDay, endDay; - String beginHour, endHour; - String beginMinute, endMinute; - String beginSecond, endSecond; - - beginYear=endYear="UNDEF"; - beginMonth="01"; - endMonth="12"; - beginDay="01"; - endDay="31"; - beginHour="00"; - endHour="23"; - beginMinute="00"; - endMinute="59"; - beginSecond="00"; - endSecond="59"; - - if(matchesDate){ - + if(mDate.reset(timexValue).matches()){ //Get Year(1) beginYear=endYear=mDate.group(1); @@ -476,9 +423,9 @@ private void findIntervals(JCas jcas) { //Get Day(5) if(mDate.group(5)==null){ - Calendar c=Calendar.getInstance(); + Calendar c=Calendar.getInstance(GMT, Locale.ROOT); c.set(Integer.parseInt(beginYear), Integer.parseInt(beginMonth)-1, 1); - endDay=""+c.getActualMaximum(Calendar.DAY_OF_MONTH); + endDay=Integer.toString(+c.getActualMaximum(Calendar.DAY_OF_MONTH)); beginDay="01"; }else{ beginDay=endDay=mDate.group(5); @@ -501,29 +448,29 @@ private void findIntervals(JCas jcas) { } } - }else if(matchesCentury){ + }else if(mCentury.reset(timexValue).matches()){ beginYear=mCentury.group(1)+"00"; endYear=mCentury.group(1)+"99"; - }else if(matchesDecade){ + }else if(mDecade.reset(timexValue).matches()){ beginYear=mDecade.group(1)+"0"; endYear=mDecade.group(1)+"9"; - }else if(matchesQuarter){ + }else if(mQuarter.reset(timexValue).matches()){ beginYear=endYear=mQuarter.group(1); int beginMonthI=3*(Integer.parseInt(mQuarter.group(2))-1)+1; - beginMonth=""+beginMonthI; - endMonth=""+(beginMonthI+2); - Calendar c=Calendar.getInstance(); + beginMonth=Integer.toString(beginMonthI); + endMonth=Integer.toString(beginMonthI+2); + Calendar c=Calendar.getInstance(GMT, Locale.ROOT); c.set(Integer.parseInt(beginYear), Integer.parseInt(endMonth)-1, 1); - endDay=""+c.getActualMaximum(Calendar.DAY_OF_MONTH); - }else if(matchesHalf){ + endDay=Integer.toString(+c.getActualMaximum(Calendar.DAY_OF_MONTH)); + }else if(mHalf.reset(timexValue).matches()){ beginYear=endYear=mHalf.group(1); int beginMonthI=6*(Integer.parseInt(mHalf.group(2))-1)+1; - beginMonth=""+beginMonthI; - endMonth=""+(beginMonthI+5); - Calendar c=Calendar.getInstance(); + beginMonth=Integer.toString(beginMonthI); + endMonth=Integer.toString(beginMonthI+5); + Calendar c=Calendar.getInstance(GMT, Locale.ROOT); c.set(Integer.parseInt(beginYear), Integer.parseInt(endMonth)-1, 1); - endDay=""+c.getActualMaximum(Calendar.DAY_OF_MONTH); - }else if(matchesSeason){ + endDay=Integer.toString(+c.getActualMaximum(Calendar.DAY_OF_MONTH)); + }else if(mSeason.reset(timexValue).matches()){ 
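+					// Season codes (SP/SU/FA/WI) resolve to fixed month/day ranges; winter runs into the following year, so its end year is begin year + 1.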
beginYear=mSeason.group(1); endYear=beginYear; if(mSeason.group(2).equals("SP")){ @@ -542,37 +489,37 @@ private void findIntervals(JCas jcas) { endMonth="12"; endDay="21"; }else if(mSeason.group(2).equals("WI")){ - endYear=""+(Integer.parseInt(beginYear)+1); + endYear=Integer.toString(Integer.parseInt(beginYear)+1); beginMonth="12"; beginDay="22"; endMonth="03"; endDay="20"; } - }else if(matchesWeek){ + }else if(mWeek.reset(timexValue).matches()){ beginYear=endYear=mWeek.group(1); - Calendar c=Calendar.getInstance(); + Calendar c=Calendar.getInstance(GMT, Locale.ROOT); c.setFirstDayOfWeek(Calendar.MONDAY); c.set(Calendar.YEAR,Integer.parseInt(beginYear)); c.set(Calendar.WEEK_OF_YEAR, Integer.parseInt(mWeek.group(2))); c.set(Calendar.DAY_OF_WEEK, Calendar.MONDAY); - beginDay=""+c.get(Calendar.DAY_OF_MONTH); - beginMonth=""+(c.get(Calendar.MONTH)+1); + beginDay=Integer.toString(+c.get(Calendar.DAY_OF_MONTH)); + beginMonth=Integer.toString(c.get(Calendar.MONTH)+1); c.set(Calendar.DAY_OF_WEEK, Calendar.SUNDAY); - endDay=""+(c.get(Calendar.DAY_OF_MONTH)); - endMonth=""+(c.get(Calendar.MONTH)+1); - }else if(matchesWeekend){ + endDay=Integer.toString(c.get(Calendar.DAY_OF_MONTH)); + endMonth=Integer.toString(c.get(Calendar.MONTH)+1); + }else if(mWeekend.reset(timexValue).matches()){ beginYear=endYear=mWeekend.group(1); - Calendar c=Calendar.getInstance(); + Calendar c=Calendar.getInstance(GMT, Locale.ROOT); c.setFirstDayOfWeek(Calendar.MONDAY); c.set(Calendar.YEAR,Integer.parseInt(beginYear)); c.set(Calendar.WEEK_OF_YEAR, Integer.parseInt(mWeekend.group(2))); c.set(Calendar.DAY_OF_WEEK, Calendar.SATURDAY); - beginDay=""+c.get(Calendar.DAY_OF_MONTH); - beginMonth=""+(c.get(Calendar.MONTH)+1); + beginDay=Integer.toString(+c.get(Calendar.DAY_OF_MONTH)); + beginMonth=Integer.toString(c.get(Calendar.MONTH)+1); c.set(Calendar.DAY_OF_WEEK, Calendar.SUNDAY); - endDay=""+(c.get(Calendar.DAY_OF_MONTH)); - endMonth=""+(c.get(Calendar.MONTH)+1); - }else if(matchesTimeOfDay){ + endDay=Integer.toString(c.get(Calendar.DAY_OF_MONTH)); + endMonth=Integer.toString(c.get(Calendar.MONTH)+1); + }else if(mTimeOfDay.reset(timexValue).matches()){ beginYear=endYear=mTimeOfDay.group(1); beginMonth=endMonth=mTimeOfDay.group(2); beginDay=endDay=mTimeOfDay.group(3); @@ -607,7 +554,7 @@ private void findIntervals(JCas jcas) { annotation.setTimexMod(timex3.getTimexMod()); annotation.setTimexQuant(timex3.getTimexMod()); annotation.setTimexType(timex3.getTimexType()); - annotation.setTimexValue(timex3.getTimexValue()); + annotation.setTimexValue(timexValue); annotation.setSentId(timex3.getSentId()); annotation.setBegin(timex3.getBegin()); annotation.setFoundByRule(timex3.getFoundByRule()); diff --git a/src/de/unihd/dbs/uima/annotator/jvntextprowrapper/JVnTextProWrapper.java b/src/de/unihd/dbs/uima/annotator/jvntextprowrapper/JVnTextProWrapper.java index afa0d419..d65c0fab 100644 --- a/src/de/unihd/dbs/uima/annotator/jvntextprowrapper/JVnTextProWrapper.java +++ b/src/de/unihd/dbs/uima/annotator/jvntextprowrapper/JVnTextProWrapper.java @@ -25,8 +25,9 @@ import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; import de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Token; @@ -35,7 +36,8 @@ * */ public class JVnTextProWrapper extends JCasAnnotator_ImplBase { - 
private Class component = this.getClass(); + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(JVnTextProWrapper.class); // definitions of what names these parameters have in the wrapper's descriptor file public static final String PARAM_SENTSEGMODEL_PATH = "sent_model_path"; @@ -74,7 +76,7 @@ public void initialize(UimaContext aContext) { if(sentModelPath != null) if(!vnSenSegmenter.init(sentModelPath)) { - Logger.printError(component, "Error initializing the sentence segmenter model: " + sentModelPath); + LOG.error("Error initializing the sentence segmenter model: " + sentModelPath); System.exit(-1); } @@ -82,7 +84,7 @@ public void initialize(UimaContext aContext) { try { vnSegmenter.init(wordModelPath); } catch(Exception e) { - Logger.printError(component, "Error initializing the word segmenter model: " + wordModelPath); + LOG.error("Error initializing the word segmenter model: " + wordModelPath); System.exit(-1); } @@ -91,7 +93,7 @@ public void initialize(UimaContext aContext) { dataTagger.addContextGenerator(new POSContextGenerator(posModelPath + File.separator + "featuretemplate.xml")); classifier = new Classification(posModelPath); } catch(Exception e) { - Logger.printError(component, "Error initializing the POS tagging model: " + posModelPath); + LOG.error("Error initializing the POS tagging model: " + posModelPath); System.exit(-1); } } diff --git a/src/de/unihd/dbs/uima/annotator/stanfordtagger/StanfordPOSTaggerWrapper.java b/src/de/unihd/dbs/uima/annotator/stanfordtagger/StanfordPOSTaggerWrapper.java index 13ec20e7..cdf89ce7 100644 --- a/src/de/unihd/dbs/uima/annotator/stanfordtagger/StanfordPOSTaggerWrapper.java +++ b/src/de/unihd/dbs/uima/annotator/stanfordtagger/StanfordPOSTaggerWrapper.java @@ -16,8 +16,11 @@ import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FSIterator; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; import de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Token; @@ -34,7 +37,8 @@ * */ public class StanfordPOSTaggerWrapper extends JCasAnnotator_ImplBase { - private Class component = this.getClass(); + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(StanfordPOSTaggerWrapper.class); // definitions of what names these parameters have in the wrapper's descriptor file public static final String PARAM_MODEL_PATH = "model_path"; @@ -66,7 +70,7 @@ public void initialize(UimaContext aContext) { // check if the model file exists if(model_path == null) { - Logger.printError(component, "The model file for the Stanford Tagger was not correctly specified."); + LOG.error("The model file for the Stanford Tagger was not correctly specified."); System.exit(-1); } @@ -81,8 +85,7 @@ public void initialize(UimaContext aContext) { mt = new MaxentTagger(model_path, new TaggerConfig("-model", model_path), false); } } catch(Exception e) { - e.printStackTrace(); - Logger.printError(component, "MaxentTagger could not be instantiated with the supplied model("+model_path+") and config("+config_path+") file."); + LOG.error("MaxentTagger could not be instantiated with the supplied model("+model_path+") and config("+config_path+") file.", e); System.exit(-1); } } @@ -91,7 +94,7 @@ public void initialize(UimaContext aContext) { * Method that gets called to process the documents' cas objects 
 */
	public void process(JCas jcas) throws AnalysisEngineProcessException {
-		Integer offset = 0; // a cursor of sorts to keep up with the position in the document text
+		int offset = 0; // a cursor of sorts to keep up with the position in the document text

		// grab the document text
		String docText = jcas.getDocumentText();
@@ -100,6 +103,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException {
		fac.setOptions("ptb3Escaping=false,untokenizable=noneKeep");
		List<List<HasWord>> tokenArray = MaxentTagger.tokenizeText(new StringReader(docText), fac);

+		int sentences = 0;
		// iterate over sentences in this document
		for(List<HasWord> sentenceToken : tokenArray) {
			List<TaggedWord> taggedSentence = mt.tagSentence(sentenceToken);

			// create a sentence object. gets added to index or discarded depending on configuration
			Sentence sentence = new Sentence(jcas);
+			sentence.setSentenceId(++sentences);
			sentence.setBegin(offset);

-			Integer wordCount = 0;
+			int wordCount = 0;
			// iterate over words in this sentence
			for(HasWord wordToken : sentenceToken) {
				Token t = new Token(jcas);
@@ -123,8 +128,8 @@ public void process(JCas jcas) throws AnalysisEngineProcessException {
				String thisWord = wordToken.word();

				if(docText.indexOf(thisWord, offset) < 0) {
-					Logger.printDetail(component, "A previously tagged token wasn't found in the document text: \"" + thisWord + "\". " +
-							"This may be due to unpredictable punctuation tokenization; hence this token isn't tagged.");
+					LOG.debug("A previously tagged token wasn't found in the document text: \"{}\". " +
+							"This may be due to unpredictable punctuation tokenization; hence this token isn't tagged.", thisWord);
					continue; // jump to next token: discards token
				} else {
					offset = docText.indexOf(thisWord, offset); // set cursor to the starting position of token in docText
@@ -152,7 +157,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException {
		}

		// TODO: DEBUG
-		FSIterator fsi = jcas.getAnnotationIndex(Sentence.type).iterator();
+		FSIterator<Annotation> fsi = jcas.getAnnotationIndex(Sentence.type).iterator();
		while(fsi.hasNext()) {
			Sentence s = (Sentence) fsi.next();
			if(s.getBegin() < 0 || s.getEnd() < 0) {
@@ -161,7 +166,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException {
				System.exit(-1);
			}
		}
-		FSIterator fsi2 = jcas.getAnnotationIndex(Token.type).iterator();
+		FSIterator<Annotation> fsi2 = jcas.getAnnotationIndex(Token.type).iterator();
		while(fsi2.hasNext()) {
			Token t = (Token) fsi2.next();
			if(t.getBegin() < 0 || t.getEnd() < 0) {
diff --git a/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerReader.java b/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerReader.java
index 371516f2..5059757b 100644
--- a/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerReader.java
+++ b/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerReader.java
@@ -15,7 +15,7 @@ public class TreeTaggerReader implements Runnable {
	private List<Token> tokens;
	private BufferedReader reader;
	private JCas jcas;
-	private Boolean annotate_sentences;
+	private boolean annotate_sentences;

	private int i; // position in list
@@ -32,7 +32,7 @@ public class TreeTaggerReader implements Runnable {
		})
	);

-	public TreeTaggerReader(List<Token> tokens,
BufferedReader reader, JCas jcas, Bo public void run() { i = 0; try { - Boolean isStarted = false; + boolean isStarted = false; Sentence sentence = null; String s = null; // wait for the starting token to arrive @@ -53,6 +53,7 @@ public void run() { break; } } + int sentences = 0; // iterate over all the output lines and tokens array (which have the same source and are hence symmetric) while(null != (s = reader.readLine()) && isStarted) { @@ -84,6 +85,7 @@ public void run() { // Finish current sentence if end-of-sentence pos was found or document ended sentence.setEnd(token.getEnd()); if(sentence.getBegin() < sentence.getEnd()){ + sentence.setSentenceId(++sentences); sentence.addToIndexes(); } @@ -112,6 +114,7 @@ public void run() { // Finish current sentence if end-of-sentence pos was found or document ended if(hsEndOfSentenceTag.contains(pos) || i == tokens.size()) { sentence.setEnd(token.getEnd()); + sentence.setSentenceId(++sentences); sentence.addToIndexes(); // Make sure current sentence is not active anymore so that a new one might be created @@ -125,6 +128,7 @@ public void run() { if(sentence != null) { sentence.setEnd(tokens.get(tokens.size() - 1).getEnd()); + sentence.setSentenceId(++sentences); sentence.addToIndexes(); } diff --git a/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerTokenizer.java b/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerTokenizer.java index ccf7b18a..0036c4e9 100644 --- a/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerTokenizer.java +++ b/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerTokenizer.java @@ -14,7 +14,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * @@ -24,6 +25,9 @@ * */ public class TreeTaggerTokenizer { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(TreeTaggerTokenizer.class); + public static enum Flag { ENGLISH, FRENCH, ITALIAN, GALICIAN, Z; @@ -65,7 +69,7 @@ public TreeTaggerTokenizer(String abbreviationsFile, EnumSet flags) throws this.abbreviationsFile = new File(abbreviationsFile); if(!this.abbreviationsFile.exists() || !this.abbreviationsFile.canRead()) { - Logger.printError(this.getClass(), "Couldn't read abbreviations file " + abbreviationsFile + + LOG.error("Couldn't read abbreviations file " + abbreviationsFile + " (exist:" + this.abbreviationsFile.exists() + ",read:" + this.abbreviationsFile.canRead() + ")"); throw new RuntimeException(); } diff --git a/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerWrapper.java b/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerWrapper.java index 44d4f521..998a6ce1 100644 --- a/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerWrapper.java +++ b/src/de/unihd/dbs/uima/annotator/treetagger/TreeTaggerWrapper.java @@ -25,23 +25,26 @@ import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.impl.RootUimaContext_impl; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ConfigurationManager; import org.apache.uima.resource.impl.ConfigurationManager_impl; import org.apache.uima.resource.impl.ResourceManager_impl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; +import de.unihd.dbs.uima.annotator.treetagger.TreeTaggerTokenizer.Flag; import 
de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Token; -import de.unihd.dbs.uima.annotator.treetagger.TreeTaggerTokenizer.Flag; /** * @author Andreas Fay, Julian Zell * */ public class TreeTaggerWrapper extends JCasAnnotator_ImplBase { - private Class component = this.getClass(); - + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(TreeTaggerWrapper.class); + // definitions of what names these parameters have in the wrapper's descriptor file public static final String PARAM_LANGUAGE = "language"; public static final String PARAM_ANNOTATE_TOKENS = "annotate_tokens"; @@ -49,42 +52,39 @@ public class TreeTaggerWrapper extends JCasAnnotator_ImplBase { public static final String PARAM_ANNOTATE_PARTOFSPEECH = "annotate_partofspeech"; public static final String PARAM_IMPROVE_GERMAN_SENTENCES = "improvegermansentences"; public static final String PARAM_CHINESE_TOKENIZER_PATH = "ChineseTokenizerPath"; - + // language for this instance of the treetaggerwrapper private Language language; - + // switches for annotation parameters private Boolean annotate_tokens = false; private Boolean annotate_sentences = false; private Boolean annotate_partofspeech = false; - + // local treetagger properties container, see below private TreeTaggerProperties ttprops = new TreeTaggerProperties(); private TreeTaggerProcess ttProc = null; - + // processing threads for I/O private TreeTaggerWriter ttwriter; private TreeTaggerReader ttreader; - + /** - * uimacontext to make secondary initialize() method possible. - * -> programmatic, non-uima pipeline usage. + * uimacontext to make secondary initialize() method possible. -> programmatic, non-uima pipeline usage. + * * @author julian * */ private class TreeTaggerContext extends RootUimaContext_impl { private ConfigurationManager mConfigManager; - + // shorthand for when we don't want to supply a cnTokPath @SuppressWarnings("unused") - public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences, - Boolean annotatePartOfSpeech, Boolean improveGermanSentences) { - this(language, annotateTokens, annotateSentences, annotatePartOfSpeech, - improveGermanSentences, null); + public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences) { + this(language, annotateTokens, annotateSentences, annotatePartOfSpeech, improveGermanSentences, null); } - - public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences, - Boolean annotatePartOfSpeech, Boolean improveGermanSentences, String cnTokPath) { + + public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences, String cnTokPath) { super(); // Initialize config @@ -95,7 +95,7 @@ public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean anno // Set session mConfigManager.setSession(this.getSession()); - + // Set necessary variables mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_LANGUAGE), language.getName()); mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_TOKENS), annotateTokens); @@ -103,196 +103,194 @@ public TreeTaggerContext(Language language, Boolean annotateTokens, Boolean anno mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_ANNOTATE_SENTENCES), annotateSentences); 
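// (Editor's sketch, not part of this patch:) each value registered here is read back later in
// initialize(UimaContext) through the standard UIMA accessor, e.g.
//   annotate_tokens = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS);
// so this programmatic context must cover every parameter the descriptor would normally supply.
// A caller outside a UIMA pipeline can then drive the wrapper directly; "/opt/treetagger" is an
// assumed install path, and the JCas is supplied by the caller's own factory:
//   TreeTaggerWrapper tt = new TreeTaggerWrapper();
//   tt.initialize(Language.ENGLISH, "/opt/treetagger", true, true, true, false);
//   tt.process(jcas);
//   tt.quit();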
mConfigManager.setConfigParameterValue(makeQualifiedName(PARAM_CHINESE_TOKENIZER_PATH), cnTokPath); } - + @Override public ConfigurationManager getConfigurationManager() { return mConfigManager; } } - + /** - * secondary initialize() to use wrapper outside of a uima pipeline - * shorthand for when we don't want to specify a cnTokPath + * secondary initialize() to use wrapper outside of a uima pipeline shorthand for when we don't want to specify a cnTokPath */ - public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens, - Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences) { - this.initialize(language, treeTaggerHome, annotateTokens, annotateSentences, annotatePartOfSpeech, - improveGermanSentences, null); + public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens, Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences) { + this.initialize(language, treeTaggerHome, annotateTokens, annotateSentences, annotatePartOfSpeech, improveGermanSentences, null); } - + /** * secondary initialize() to use wrapper outside of a uima pipeline * - * @param language Language/parameter file to use for the TreeTagger - * @param treeTaggerHome Path to the TreeTagger folder - * @param annotateTokens Whether to annotate tokens - * @param annotateSentences Whether to annotate sentences - * @param annotatePartOfSpeech Whether to annotate POS tags - * @param improveGermanSentences Whether to do improvements for german sentences + * @param language + * Language/parameter file to use for the TreeTagger + * @param treeTaggerHome + * Path to the TreeTagger folder + * @param annotateTokens + * Whether to annotate tokens + * @param annotateSentences + * Whether to annotate sentences + * @param annotatePartOfSpeech + * Whether to annotate POS tags + * @param improveGermanSentences + * Whether to do improvements for german sentences */ - public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens, - Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences, String cnTokPath) { + public void initialize(Language language, String treeTaggerHome, Boolean annotateTokens, Boolean annotateSentences, Boolean annotatePartOfSpeech, Boolean improveGermanSentences, + String cnTokPath) { this.setHome(treeTaggerHome); - - TreeTaggerContext ttContext = new TreeTaggerContext(language, annotateTokens, - annotateSentences, annotatePartOfSpeech, improveGermanSentences, cnTokPath); - - this.initialize(ttContext); + + TreeTaggerContext ttContext = new TreeTaggerContext(language, annotateTokens, annotateSentences, annotatePartOfSpeech, improveGermanSentences, cnTokPath); + + this.initialize(ttContext); } - + /** * initialization method where we fill configuration values and check some prerequisites */ public void initialize(UimaContext aContext) { // check if the supplied language is one that we can currently handle this.language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE)); - + // get configuration from the descriptor annotate_tokens = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_TOKENS); annotate_sentences = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_SENTENCES); annotate_partofspeech = (Boolean) aContext.getConfigParameterValue(PARAM_ANNOTATE_PARTOFSPEECH); String cnTokPath = (String) aContext.getConfigParameterValue(PARAM_CHINESE_TOKENIZER_PATH); - + // set some configuration based 
upon these values ttprops.languageName = language.getTreeTaggerLangName(); - if(ttprops.rootPath == null) + if (ttprops.rootPath == null) ttprops.rootPath = System.getenv("TREETAGGER_HOME"); ttprops.tokScriptName = "utf8-tokenize.perl"; - + // parameter file - if(!(new File(ttprops.rootPath+ttprops.fileSeparator+"lib", ttprops.languageName + "-utf8.par").exists())) // get UTF8 version if it exists + if (!(new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.languageName + "-utf8.par").exists())) // get UTF8 version if it exists ttprops.parFileName = ttprops.languageName + ".par"; else ttprops.parFileName = ttprops.languageName + "-utf8.par"; - + // abbreviation file - if(new File(ttprops.rootPath+ttprops.fileSeparator+"lib", ttprops.languageName + "-abbreviations-utf8").exists()) { // get UTF8 version if it exists + if (new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.languageName + "-abbreviations-utf8").exists()) // get UTF8 version if it exists ttprops.abbFileName = ttprops.languageName + "-abbreviations-utf8"; - } else { + else ttprops.abbFileName = ttprops.languageName + "-abbreviations"; - } - + ttprops.languageSwitch = language.getTreeTaggerSwitch(); - if(cnTokPath != null && !cnTokPath.equals("")) + if (cnTokPath != null && !cnTokPath.equals("")) ttprops.chineseTokenizerPath = new File(cnTokPath); else ttprops.chineseTokenizerPath = new File(ttprops.rootPath, "cmd"); - + // handle the treetagger path from the environment variables - if(ttprops.rootPath == null) { - Logger.printError("TreeTagger environment variable is not present, aborting."); + if (ttprops.rootPath == null) { + LOG.error("TreeTagger environment variable is not present, aborting."); System.exit(-1); } // Check for whether the required treetagger parameter files are present - Boolean abbFileFlag = true; - Boolean parFileFlag = true; - Boolean tokScriptFlag = true; - File abbFile = new File(ttprops.rootPath+ttprops.fileSeparator+"lib", ttprops.abbFileName); - File parFile = new File(ttprops.rootPath+ttprops.fileSeparator+"lib", ttprops.parFileName); - File tokFile = new File(ttprops.rootPath+ttprops.fileSeparator+"cmd", ttprops.tokScriptName); + boolean abbFileFlag = true; + boolean parFileFlag = true; + boolean tokScriptFlag = true; + File abbFile = new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.abbFileName); + File parFile = new File(ttprops.rootPath + ttprops.fileSeparator + "lib", ttprops.parFileName); + File tokFile = new File(ttprops.rootPath + ttprops.fileSeparator + "cmd", ttprops.tokScriptName); if (!(abbFileFlag = abbFile.exists())) { - if(language.equals(Language.CHINESE) || language.equals(Language.RUSSIAN)) { + if (language.equals(Language.CHINESE) || language.equals(Language.RUSSIAN)) { abbFileFlag = true; ttprops.abbFileName = null; - } else { - Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.abbFileName); - } - } - if (!(parFileFlag = parFile.exists())) { - Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.parFileName); + } else + LOG.error("File missing to use TreeTagger tokenizer: {}", ttprops.abbFileName); } + if (!(parFileFlag = parFile.exists())) + LOG.error("File missing to use TreeTagger tokenizer: {}", ttprops.parFileName); if (!(tokScriptFlag = tokFile.exists())) { - if(language.equals(Language.CHINESE)) + if (language.equals(Language.CHINESE)) tokScriptFlag = true; else - Logger.printError(component, "File missing to use TreeTagger tokenizer: " + ttprops.tokScriptName); + 
LOG.error("File missing to use TreeTagger tokenizer: {}", ttprops.tokScriptName); } if (!abbFileFlag || !parFileFlag || !tokScriptFlag) { - Logger.printError(component, "Cannot find tree tagger (" + ttprops.rootPath + ttprops.fileSeparator - + "cmd" + ttprops.fileSeparator + ttprops.tokScriptName + ")." + - " Make sure that path to tree tagger is set correctly in config.props!"); - Logger.printError(component, "If path is set correctly:"); - Logger.printError(component, "Maybe you need to download the TreeTagger tagger-scripts.tar.gz"); - Logger.printError(component, "from http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz"); - Logger.printError(component, "Extract this file and copy the missing file into the corresponding TreeTagger directories."); - Logger.printError(component, "If missing, copy " + ttprops.abbFileName + " into " + ttprops.rootPath+ttprops.fileSeparator+"lib"); - Logger.printError(component, "If missing, copy " + ttprops.parFileName + " into " + ttprops.rootPath+ttprops.fileSeparator+"lib"); - Logger.printError(component, "If missing, copy " + ttprops.tokScriptName + " into " + ttprops.rootPath+ttprops.fileSeparator+"cmd"); + LOG.error("Cannot find tree tagger (" + ttprops.rootPath + ttprops.fileSeparator + "cmd" + ttprops.fileSeparator + ttprops.tokScriptName + ")." + + " Make sure that path to tree tagger is set correctly in config.props!" + "\n" + "If path is set correctly:" + "\n" + + "Maybe you need to download the TreeTagger tagger-scripts.tar.gz" + "\n" + + "from http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz" + "\n" + + "Extract this file and copy the missing file into the corresponding TreeTagger directories." + "\n" + "If missing, copy " + ttprops.abbFileName + " into " + + ttprops.rootPath + ttprops.fileSeparator + "lib" + "\n" + "If missing, copy " + ttprops.parFileName + " into " + ttprops.rootPath + ttprops.fileSeparator + + "lib" + "\n" + "If missing, copy " + ttprops.tokScriptName + " into " + ttprops.rootPath + ttprops.fileSeparator + "cmd"); System.exit(-1); } } - + /** * Method that gets called to process the documents' cas objects */ public void process(JCas jcas) throws AnalysisEngineProcessException { // if the annotate_tokens flag is set, annotate the tokens and add them to the jcas - if(annotate_tokens) - if(language.equals(Language.CHINESE)) + if (annotate_tokens) + if (language.equals(Language.CHINESE)) tokenizeChinese(jcas); // chinese needs different tokenization else tokenize(jcas); - /* if the annotate_partofspeech flag is set, annotate partofspeech and, - * if specified, also tag sentences based upon the partofspeech tags. + /* + * if the annotate_partofspeech flag is set, annotate partofspeech and, if specified, also tag sentences based upon the partofspeech tags. */ - if(annotate_partofspeech) + if (annotate_partofspeech) doTreeTag(jcas); - + // if the improve_german_sentences flag is set, improve the sentence tokens made by the treetagger - if(this.language == Language.GERMAN) + if (this.language == Language.GERMAN) { improveGermanSentences(jcas); - + renumberSentences(jcas); + } + // if French, improve the sentence tokens made by the TreeTagger with settings for French - if (this.language == Language.FRENCH) + if (this.language == Language.FRENCH) { improveFrenchSentences(jcas); - + renumberSentences(jcas); + } + } - + /** - * tokenizes a given JCas object's document text using the treetagger program - * and adds the recognized tokens to the JCas object. 
- * @param jcas JCas object supplied by the pipeline + * tokenizes a given JCas object's document text using the treetagger program and adds the recognized tokens to the JCas object. + * + * @param jcas + * JCas object supplied by the pipeline */ private void tokenize(JCas jcas) { // read tokenized text to add tokens to the jcas - Logger.printDetail(component, "TreeTagger (tokenization) with: " + ttprops.abbFileName); - + LOG.debug("TreeTagger (tokenization) with: {}", ttprops.abbFileName); + EnumSet flags = Flag.getSet(ttprops.languageSwitch); - TreeTaggerTokenizer ttt; ttprops.abbFileName = "english-abbreviations"; - if(ttprops.abbFileName != null) { + TreeTaggerTokenizer ttt; + ttprops.abbFileName = "english-abbreviations"; + if (ttprops.abbFileName != null) { ttt = new TreeTaggerTokenizer(ttprops.rootPath + ttprops.fileSeparator + "lib" + ttprops.fileSeparator + ttprops.abbFileName, flags); } else { ttt = new TreeTaggerTokenizer(null, flags); } - + String docText = jcas.getDocumentText().replaceAll("\n\n", "\nEMPTYLINE\n"); List tokenized = ttt.tokenize(docText); - + int tokenOffset = 0; // loop through all the lines in the treetagger output - for(String s : tokenized) { + for (String s : tokenized) { // charset missmatch fallback: signal (invalid) s if ((!(s.equals("EMPTYLINE"))) && (jcas.getDocumentText().indexOf(s, tokenOffset) < 0)) { - Logger.printError(component, "Tokenization was interrupted because the token \"" + s - + "\" could not be found in the original text. The reason for this might be " - + "that the encoding of the document is not UTF-8. This token was skipped and " - + "if it was part of a temporal expression, will not be extracted."); + LOG.error("Tokenization was interrupted because the token \"" + s + "\" could not be found in the original text. The reason for this might be " + + "that the encoding of the document is not UTF-8. This token was skipped and " + "if it was part of a temporal expression, will not be extracted."); continue; } // create tokens and add them to the jcas's indexes. Token newToken = new Token(jcas); - if (s.equals("EMPTYLINE")){ + if (s.equals("EMPTYLINE")) { newToken.setBegin(tokenOffset); newToken.setEnd(tokenOffset); newToken.setPos("EMPTYLINE"); - if (annotate_partofspeech){ + if (annotate_partofspeech) newToken.addToIndexes(); - } - } - else{ + } else { newToken.setBegin(jcas.getDocumentText().indexOf(s, tokenOffset)); newToken.setEnd(newToken.getBegin() + s.length()); newToken.addToIndexes(); @@ -300,42 +298,42 @@ private void tokenize(JCas jcas) { } } } - + /** - * tokenizes a given JCas object's document text using the chinese tokenization - * script and adds the recognized tokens to the JCas object. - * @param jcas JCas object supplied by the pipeline + * tokenizes a given JCas object's document text using the chinese tokenization script and adds the recognized tokens to the JCas object. 
+ * + * @param jcas + * JCas object supplied by the pipeline */ private void tokenizeChinese(JCas jcas) { try { // read tokenized text to add tokens to the jcas Process proc = ttprops.getChineseTokenizationProcess(); - Logger.printDetail(component, "Chinese tokenization: " + ttprops.chineseTokenizerPath); - + LOG.debug("Chinese tokenization: {}", ttprops.chineseTokenizerPath); + BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8")); BufferedWriter out = new BufferedWriter(new OutputStreamWriter(proc.getOutputStream(), "UTF-8")); - - Integer tokenOffset = 0; + + int tokenOffset = 0; // loop through all the lines in the stdout output String[] inSplits = jcas.getDocumentText().split("[\\r\\n]+"); - for(String inSplit : inSplits) { + for (String inSplit : inSplits) { out.write(inSplit); out.newLine(); out.flush(); - + // do one initial read String s = in.readLine(); do { // break out of the loop if we've read a null - if(s == null) + if (s == null) break; - + String[] outSplits = s.split("\\s+"); - for(String tok : outSplits) { - if(jcas.getDocumentText().indexOf(tok, tokenOffset) < 0) - throw new RuntimeException("Could not find token " + tok + - " in JCas after tokenizing with Chinese tokenization script."); - + for (String tok : outSplits) { + if (jcas.getDocumentText().indexOf(tok, tokenOffset) < 0) + throw new RuntimeException("Could not find token " + tok + " in JCas after tokenizing with Chinese tokenization script."); + // create tokens and add them to the jcas's indexes. Token newToken = new Token(jcas); newToken.setBegin(jcas.getDocumentText().indexOf(tok, tokenOffset)); @@ -343,15 +341,15 @@ private void tokenizeChinese(JCas jcas) { newToken.addToIndexes(); tokenOffset = newToken.getEnd(); } - + // break out of the loop if the next read will block - if(!in.ready()) + if (!in.ready()) break; - + s = in.readLine(); - } while(true); + } while (true); } - + // clean up in.close(); proc.destroy(); @@ -360,95 +358,92 @@ private void tokenizeChinese(JCas jcas) { } } - /** - * based on tokens from the jcas object, adds part of speech (POS) and sentence - * tags to the jcas object using the treetagger program. - * @param jcas JCas object supplied by the pipeline + * based on tokens from the jcas object, adds part of speech (POS) and sentence tags to the jcas object using the treetagger program. 
+ * + * @param jcas + * JCas object supplied by the pipeline */ private void doTreeTag(JCas jcas) { try { - if(ttProc == null) { + if (ttProc == null) ttProc = new TreeTaggerProcess(ttprops.getTreeTaggingProcess()); - } - - Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName); - - AnnotationIndex ai = jcas.getAnnotationIndex(Token.type); + + LOG.debug("TreeTagger (pos tagging) with: {}", ttprops.parFileName); + + AnnotationIndex ai = jcas.getAnnotationIndex(Token.type); List tokenStrings = new ArrayList<>(); List tokens = new ArrayList<>(); - for(FSIterator fsi = ai.iterator(); fsi.hasNext();) { - Token token = (Token) fsi.next(); + for (FSIterator fsi = ai.iterator(); fsi.hasNext();) { + Token token = fsi.next(); tokenStrings.add(token.getCoveredText()); tokens.add(token); } - + ttreader = new TreeTaggerReader(tokens, ttProc.getStdout(), jcas, annotate_sentences); ttwriter = new TreeTaggerWriter(tokenStrings, ttProc.getStdin()); - + Thread rThread = new Thread(ttreader); Thread wThread = new Thread(ttwriter); - + rThread.start(); wThread.start(); - + rThread.join(); wThread.join(); - } catch(IOException | InterruptedException e) { + } catch (IOException | InterruptedException e) { e.printStackTrace(); } } - /** - * based on tokens from the jcas object, adds part of speech (POS) and sentence - * tags to the jcas object using the treetagger program. - * @param jcas JCas object supplied by the pipeline + * based on tokens from the jcas object, adds part of speech (POS) and sentence tags to the jcas object using the treetagger program. + * + * @param jcas + * JCas object supplied by the pipeline */ - @SuppressWarnings({"unused"}) + @SuppressWarnings({ "unused" }) private void doTreeTagOld(JCas jcas) { File tmpDocument = null; BufferedWriter tmpFileWriter; ArrayList tokens = new ArrayList(); - + try { // create a temporary file and write our pre-existing tokens to it. 
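// (Editor's note, an illustrative aside rather than patch content:) the threaded doTreeTag()
// above avoids the classic external-process deadlock: a single thread that first writes every
// token to the tagger's stdin and only then reads its stdout can block as soon as an OS pipe
// buffer fills. Hence the concurrent reader/writer pair:
//   Thread rThread = new Thread(ttreader); // TreeTaggerReader: drains stdout into the JCas
//   Thread wThread = new Thread(ttwriter); // TreeTaggerWriter: feeds token strings to stdin
//   rThread.start(); wThread.start();
//   rThread.join(); wThread.join();
// The legacy variant here instead round-trips through the temporary file created next.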
tmpDocument = File.createTempFile("postokens", null); tmpFileWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpDocument), "UTF-8")); // iterate over existing tokens - FSIterator ai = jcas.getAnnotationIndex(Token.type).iterator(); - while(ai.hasNext()) { - Token t = (Token) ai.next(); - + AnnotationIndex toks = jcas.getAnnotationIndex(Token.type); + for (FSIterator ai = toks.iterator(); ai.hasNext();) { + Token t = ai.next(); tokens.add(t); - if (!(t.getBegin() == t.getEnd())){ + if (t.getBegin() != t.getEnd()) tmpFileWriter.write(t.getCoveredText() + ttprops.newLineSeparator); - } } - + tmpFileWriter.close(); - } catch(IOException e) { - Logger.printError("Something went wrong creating a temporary file for the treetagger to process."); + } catch (IOException e) { + LOG.error("Something went wrong creating a temporary file for the treetagger to process."); System.exit(-1); } // Possible End-of-Sentence Tags HashSet hsEndOfSentenceTag = new HashSet(); - hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK, - hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH - hsEndOfSentenceTag.add("FS"); // SPANISH + hsEndOfSentenceTag.add("SENT"); // ENGLISH, FRENCH, GREEK, + hsEndOfSentenceTag.add("$."); // GERMAN, DUTCH + hsEndOfSentenceTag.add("FS"); // SPANISH hsEndOfSentenceTag.add("_Z_Fst"); // ESTONIAN hsEndOfSentenceTag.add("_Z_Int"); // ESTONIAN hsEndOfSentenceTag.add("_Z_Exc"); // ESTONIAN hsEndOfSentenceTag.add("ew"); // CHINESE - + try { Process p = ttprops.getTreeTaggingProcess(tmpDocument); - Logger.printDetail(component, "TreeTagger (pos tagging) with: " + ttprops.parFileName); - + LOG.debug("TreeTagger (pos tagging) with: {}", ttprops.parFileName); + BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8")); - + Sentence sentence = null; // iterate over all the output lines and tokens array (which have the same source and are hence symmetric) int i = 0; @@ -457,7 +452,7 @@ private void doTreeTagOld(JCas jcas) { // grab a token Token token = tokens.get(i++); // modified (Aug 29, 2011): Handle empty tokens (such as empty lines) in input file - while (token.getCoveredText().equals("")){ + while (token.getCoveredText().equals("")) { // if part of the configuration, also add sentences to the jcas document if ((annotate_sentences) && (token.getPos() != null && token.getPos().equals("EMPTYLINE"))) { // Establish sentence structure @@ -465,55 +460,53 @@ private void doTreeTagOld(JCas jcas) { sentence = new Sentence(jcas); sentence.setBegin(token.getBegin()); } - + // Finish current sentence if end-of-sentence pos was found or document ended sentence.setEnd(token.getEnd()); - if (sentence.getBegin() < sentence.getEnd()){ + if (sentence.getBegin() < sentence.getEnd()) sentence.addToIndexes(); - } - + // Make sure current sentence is not active anymore so that a new one might be created sentence = null; -// sentence = new Sentence(jcas); + // sentence = new Sentence(jcas); } token.removeFromIndexes(); token = tokens.get(i++); } // remove tokens, otherwise they are in the index twice - token.removeFromIndexes(); + token.removeFromIndexes(); // set part of speech tag and add to indexes again - if (!(token.getCoveredText().equals(""))){ + if (!(token.getCoveredText().equals(""))) { token.setPos(s); token.addToIndexes(); } - + // if part of the configuration, also add sentences to the jcas document - if(annotate_sentences) { + if (annotate_sentences) { // Establish sentence structure if (sentence == null) { sentence = new Sentence(jcas); 
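// (Editor's note, illustrative only:) sentence boundaries in this method come purely from the
// POS stream: hsEndOfSentenceTag above lists the language-specific end-of-sentence tags
// ("SENT" English/French/Greek, "$." German/Dutch, "FS" Spanish, "_Z_Fst"/"_Z_Int"/"_Z_Exc"
// Estonian, "ew" Chinese), while "EMPTYLINE" pseudo-tokens inserted by tokenize() force a
// break on blank lines. The actual check appears further below:
//   if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) {
//       sentence.setEnd(token.getEnd());
//       sentence.addToIndexes();
//       sentence = null; // allow a new sentence to be created
//   }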
sentence.setBegin(token.getBegin()); } - + // Finish current sentence if end-of-sentence pos was found or document ended if (hsEndOfSentenceTag.contains(s) || i == tokens.size()) { sentence.setEnd(token.getEnd()); sentence.addToIndexes(); - + // Make sure current sentence is not active anymore so that a new one might be created sentence = null; } } } - while (i < tokens.size()){ - if (!(sentence == null)){ - sentence.setEnd(tokens.get(tokens.size()-1).getEnd()); + while (i < tokens.size()) { + if (sentence != null) { + sentence.setEnd(tokens.get(tokens.size() - 1).getEnd()); sentence.addToIndexes(); } Token token = tokens.get(i++); - if (token.getPos() != null && token.getPos().equals("EMPTYLINE")){ + if (token.getPos() != null && token.getPos().equals("EMPTYLINE")) token.removeFromIndexes(); - } } in.close(); p.destroy(); @@ -524,40 +517,35 @@ private void doTreeTagOld(JCas jcas) { tmpDocument.delete(); } } - + public void setHome(String home) { - this.ttprops.rootPath = home; + this.ttprops.rootPath = home; } - + private void improveFrenchSentences(JCas jcas) { - HashSet hsRemoveAnnotations = new HashSet(); - HashSet hsAddAnnotations = new HashSet(); - + HashSet hsRemoveAnnotations = new HashSet(); + HashSet hsAddAnnotations = new HashSet(); + HashSet hsSentenceBeginnings = new HashSet(); hsSentenceBeginnings.add("J.-C."); hsSentenceBeginnings.add("J-C."); hsSentenceBeginnings.add("NSJC"); - - Boolean changes = true; + + boolean changes = true; while (changes) { changes = false; - FSIndex annoHeidelSentences = jcas.getAnnotationIndex(de.unihd.dbs.uima.types.heideltime.Sentence.type); - FSIterator iterHeidelSent = annoHeidelSentences.iterator(); - while (iterHeidelSent.hasNext()){ - de.unihd.dbs.uima.types.heideltime.Sentence s1 = (de.unihd.dbs.uima.types.heideltime.Sentence) iterHeidelSent.next(); - - if ((s1.getCoveredText().endsWith("av.")) || - (s1.getCoveredText().endsWith("Av.")) || - (s1.getCoveredText().endsWith("apr.")) || - (s1.getCoveredText().endsWith("Apr.")) || - (s1.getCoveredText().endsWith("avant.")) || - (s1.getCoveredText().endsWith("Avant."))){ - if (iterHeidelSent.hasNext()){ - de.unihd.dbs.uima.types.heideltime.Sentence s2 = (de.unihd.dbs.uima.types.heideltime.Sentence) iterHeidelSent.next(); + FSIndex annoHeidelSentences = jcas.getAnnotationIndex(Sentence.type); + for (FSIterator iterHeidelSent = annoHeidelSentences.iterator(); iterHeidelSent.hasNext();) { + Sentence s1 = iterHeidelSent.next(); + + if ((s1.getCoveredText().endsWith("av.")) || (s1.getCoveredText().endsWith("Av.")) || (s1.getCoveredText().endsWith("apr.")) || (s1.getCoveredText().endsWith("Apr.")) + || (s1.getCoveredText().endsWith("avant.")) || (s1.getCoveredText().endsWith("Avant."))) { + if (iterHeidelSent.hasNext()) { + Sentence s2 = iterHeidelSent.next(); iterHeidelSent.moveToPrevious(); - for (String beg : hsSentenceBeginnings){ - if (s2.getCoveredText().startsWith(beg)){ - de.unihd.dbs.uima.types.heideltime.Sentence s3 = new de.unihd.dbs.uima.types.heideltime.Sentence(jcas); + for (String beg : hsSentenceBeginnings) { + if (s2.getCoveredText().startsWith(beg)) { + Sentence s3 = new Sentence(jcas); s3.setBegin(s1.getBegin()); s3.setEnd(s2.getEnd()); hsAddAnnotations.add(s3); @@ -569,143 +557,135 @@ private void improveFrenchSentences(JCas jcas) { } } } - - + } - for (de.unihd.dbs.uima.types.heideltime.Sentence s : hsRemoveAnnotations){ + for (Sentence s : hsRemoveAnnotations) s.removeFromIndexes(jcas); - } hsRemoveAnnotations.clear(); - for (de.unihd.dbs.uima.types.heideltime.Sentence s : 
hsAddAnnotations){ + for (Sentence s : hsAddAnnotations) s.addToIndexes(jcas); - } hsAddAnnotations.clear(); } } - - /** * improve german sentences; the treetagger splits german sentences incorrectly on some occasions - * @param jcas JCas object supplied by the pipeline + * + * @param jcas + * JCas object supplied by the pipeline */ private void improveGermanSentences(JCas jcas) { - /* - * these POS tag sequences will decide whether we want to merge two sentences - * that have (supposedly wrongfully) been split. + /* + * these POS tag sequences will decide whether we want to merge two sentences that have (supposedly wrongfully) been split. */ HashSet posRules = new HashSet(); - posRules.add(new String[] {"CARD", "\\$.", "NN"}); - posRules.add(new String[] {"CARD", "\\$.", "NE"}); - - FSIterator sentIter = jcas.getAnnotationIndex(Sentence.type).iterator(); - + posRules.add(new String[] { "CARD", "\\$.", "NN" }); + posRules.add(new String[] { "CARD", "\\$.", "NE" }); + + AnnotationIndex sentences = jcas.getAnnotationIndex(Sentence.type); + // compare two sentences at a time in order to have access to all POS tags HashSet> toMerge = new HashSet>(); Sentence prevSent = null, thisSent = null; - while(sentIter.hasNext()) { - if(thisSent == null) { - thisSent = (Sentence) sentIter.next(); + for (FSIterator sentIter = sentences.iterator(); sentIter.hasNext();) { + if (thisSent == null) { + thisSent = sentIter.next(); continue; } - + prevSent = thisSent; - thisSent = (Sentence) sentIter.next(); - /* - * select the last two tokens within the previous sentence as well as the - * first of the current one and check for matches. + thisSent = sentIter.next(); + /* + * select the last two tokens within the previous sentence as well as the first of the current one and check for matches. */ Token penultimateToken = null, ultimateToken = null, firstToken = null; - FSIterator tokIter = jcas.getAnnotationIndex(Token.type).subiterator(thisSent); - if(tokIter.hasNext()) { - firstToken = (Token) tokIter.next(); - } - - tokIter = jcas.getAnnotationIndex(Token.type).subiterator(prevSent); - while(tokIter.hasNext()) { - if(ultimateToken == null) { - ultimateToken = (Token) tokIter.next(); - continue; - } + AnnotationIndex toks = jcas.getAnnotationIndex(Token.type); + FSIterator tokIter = toks.subiterator(thisSent); + if (tokIter.hasNext()) + firstToken = tokIter.next(); + + tokIter = toks.subiterator(prevSent); + while (tokIter.hasNext()) { penultimateToken = ultimateToken; - ultimateToken = (Token) tokIter.next(); + ultimateToken = tokIter.next(); } - + // check that all tokens for further analysis are present. if not: skip - if(penultimateToken == null || ultimateToken == null || firstToken == null) { + if (penultimateToken == null || ultimateToken == null || firstToken == null) continue; - } - + // check rules, memorize sentences to be merged - for(String[] posRule : posRules) { - /* - * either one of the pre-defined POS rules fit, or the first token's - * covered text begins with lower case characters. + for (String[] posRule : posRules) { + /* + * either one of the pre-defined POS rules fit, or the first token's covered text begins with lower case characters. 
*/ - if((penultimateToken.getPos() != null && penultimateToken.getPos().matches(posRule[0]) && - ultimateToken.getPos() != null && ultimateToken.getPos().matches(posRule[1]) && - firstToken.getPos() != null && firstToken.getPos().matches(posRule[2])) - || - (firstToken.getCoveredText().matches("^[a-z/].*"))) { - /* - * check whether one of the previous candidate pairs already - * contains one of our sentences. + if ((penultimateToken.getPos() != null && penultimateToken.getPos().matches(posRule[0]) && ultimateToken.getPos() != null && ultimateToken.getPos().matches(posRule[1]) + && firstToken.getPos() != null && firstToken.getPos().matches(posRule[2])) || firstToken.getCoveredText().matches("^[a-z/].*")) { + /* + * check whether one of the previous candidate pairs already contains one of our sentences. */ - Boolean candidateExisted = false; - for(HashSet mergeCandidate : toMerge) { - if(mergeCandidate.contains(thisSent) || mergeCandidate.contains(prevSent)) { + boolean candidateExisted = false; + for (HashSet mergeCandidate : toMerge) + if (mergeCandidate.contains(thisSent) || mergeCandidate.contains(prevSent)) { // we add both here because sets ignore duplicates mergeCandidate.add(prevSent); mergeCandidate.add(thisSent); - + candidateExisted = true; break; } - } - - /* - * if one of the sentences was not already to be merged with another, - * create a new merge candidate set + + /* + * if one of the sentences was not already to be merged with another, create a new merge candidate set */ - if(!candidateExisted) { + if (!candidateExisted) { HashSet newCandidate = new HashSet(); newCandidate.add(prevSent); newCandidate.add(thisSent); - + toMerge.add(newCandidate); } - + break; // don't need to do the next rules; already merging. } } } - + // iterate over the previously collected merge candidates - - for(HashSet mergeCandidate : toMerge) { + + for (HashSet mergeCandidate : toMerge) { // find the earliest beginning and latest end for the set of sentences - Integer beginIndex = Integer.MAX_VALUE, endIndex = Integer.MIN_VALUE; + int beginIndex = Integer.MAX_VALUE, endIndex = Integer.MIN_VALUE; Sentence mergedSent = new Sentence(jcas); - for(Sentence s : mergeCandidate) { - if(s.getBegin() < beginIndex) { + for (Sentence s : mergeCandidate) { + if (s.getBegin() < beginIndex) beginIndex = s.getBegin(); - } - - if(s.getEnd() > endIndex) { + if (s.getEnd() > endIndex) endIndex = s.getEnd(); - } - s.removeFromIndexes(); } - + // set values, add to jcas mergedSent.setBegin(beginIndex); mergedSent.setEnd(endIndex); mergedSent.addToIndexes(); } } - + + private void renumberSentences(JCas jcas) { + AnnotationIndex sentences = jcas.getAnnotationIndex(Sentence.type); + ArrayList sents = new ArrayList<>(sentences.size()); + for (FSIterator it = sentences.iterator(); it.hasNext();) + sents.add(it.next()); + int i = 0; + for (Sentence s : sents) { + s.removeFromIndexes(jcas); + s.setSentenceId(++i); + s.addToIndexes(jcas); + } + } + public void quit() { ttProc.close(); ttProc = null; diff --git a/src/de/unihd/dbs/uima/consumer/eventi2014writer/Eventi2014Writer.java b/src/de/unihd/dbs/uima/consumer/eventi2014writer/Eventi2014Writer.java index 4dc8454e..00a88837 100644 --- a/src/de/unihd/dbs/uima/consumer/eventi2014writer/Eventi2014Writer.java +++ b/src/de/unihd/dbs/uima/consumer/eventi2014writer/Eventi2014Writer.java @@ -30,15 +30,17 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceProcessException; +import 
org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; import de.unihd.dbs.uima.types.heideltime.Dct; import de.unihd.dbs.uima.types.heideltime.Timex3; import de.unihd.dbs.uima.types.heideltime.Timex3Interval; import de.unihd.dbs.uima.types.heideltime.Token; public class Eventi2014Writer extends CasConsumer_ImplBase { - private Class component = this.getClass(); + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(Eventi2014Writer.class); private static final String PARAM_OUTPUTDIR = "OutputDir"; @@ -52,13 +54,13 @@ public void initialize() throws ResourceInitializationException { if (!mOutputDir.exists()) { if(!mOutputDir.mkdirs()) { - Logger.printError(component, "Couldn't create non-existant folder "+mOutputDir.getAbsolutePath()); + LOG.error("Couldn't create non-existent folder "+mOutputDir.getAbsolutePath()); throw new ResourceInitializationException(); } } if(!mOutputDir.canWrite()) { - Logger.printError(component, "Folder "+mOutputDir.getAbsolutePath()+" is not writable."); + LOG.error("Folder "+mOutputDir.getAbsolutePath()+" is not writable."); throw new ResourceInitializationException(); } } @@ -234,14 +236,12 @@ private void writeDocument(String fullDocument, String filename) { bw.append(fullDocument); } catch (IOException e) { // something went wrong with the bufferedwriter - e.printStackTrace(); - Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be written."); + LOG.error("File "+outFile.getAbsolutePath()+" could not be written.", e); } finally { // clean up for the bufferedwriter try { bw.close(); } catch(IOException e) { - e.printStackTrace(); - Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be closed."); + LOG.error("File "+outFile.getAbsolutePath()+" could not be closed.", e); } } } diff --git a/src/de/unihd/dbs/uima/consumer/tempeval3writer/TempEval3Writer.java b/src/de/unihd/dbs/uima/consumer/tempeval3writer/TempEval3Writer.java index df8fba13..c9f8dd3c 100644 --- a/src/de/unihd/dbs/uima/consumer/tempeval3writer/TempEval3Writer.java +++ b/src/de/unihd/dbs/uima/consumer/tempeval3writer/TempEval3Writer.java @@ -23,15 +23,17 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceProcessException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; import de.unihd.dbs.uima.types.heideltime.Dct; import de.unihd.dbs.uima.types.heideltime.Timex3; public class TempEval3Writer extends CasConsumer_ImplBase { - private Class component = this.getClass(); + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(TempEval3Writer.class); private static final String PARAM_OUTPUTDIR = "OutputDir"; @@ -45,13 +47,13 @@ public void initialize() throws ResourceInitializationException { if (!mOutputDir.exists()) { if(!mOutputDir.mkdirs()) { - Logger.printError(component, "Couldn't create non-existant folder "+mOutputDir.getAbsolutePath()); + LOG.error("Couldn't create non-existent folder "+mOutputDir.getAbsolutePath()); throw new ResourceInitializationException(); } } if(!mOutputDir.canWrite()) { - Logger.printError(component, "Folder "+mOutputDir.getAbsolutePath()+" is not writable."); + LOG.error("Folder "+mOutputDir.getAbsolutePath()+" is not writable."); throw new ResourceInitializationException(); } } @@ -99,8 
+101,7 @@ private Document buildTimeMLDocument(JCas jcas, Dct dct, String filename) { db = dbf.newDocumentBuilder(); doc = db.newDocument(); } catch (ParserConfigurationException e) { - e.printStackTrace(); - Logger.printError(component, "XML Builder could not be instantiated"); + LOG.error("XML Builder could not be instantiated", e); } // create the TimeML root element @@ -166,7 +167,7 @@ private Document buildTimeMLDocument(JCas jcas, Dct dct, String filename) { prevT = thisT; // this iteration's prevT was removed; setting for new iteration } - Logger.printError(component, "Two overlapping Timexes have been discovered:" + System.getProperty("line.separator") + LOG.error("Two overlapping Timexes have been discovered:" + System.getProperty("line.separator") + "Timex A: " + prevT.getCoveredText() + " [\"" + prevT.getTimexValue() + "\" / " + prevT.getBegin() + ":" + prevT.getEnd() + "]" + System.getProperty("line.separator") + "Timex B: " + removedT.getCoveredText() + " [\"" + removedT.getTimexValue() + "\" / " + removedT.getBegin() + ":" + removedT.getEnd() + "]" @@ -253,17 +254,14 @@ private void writeTimeMLDocument(Document xmlDoc, String filename) { // transform transformer.transform(source, result); } catch (IOException e) { // something went wrong with the bufferedwriter - e.printStackTrace(); - Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be written."); + LOG.error("File "+outFile.getAbsolutePath()+" could not be written.", e); } catch (TransformerException e) { // the transformer malfunctioned (call optimus prime) - e.printStackTrace(); - Logger.printError(component, "XML transformer could not be properly initialized."); + LOG.error("XML transformer could not be properly initialized.", e); } finally { // clean up for the bufferedwriter try { bw.close(); } catch(IOException e) { - e.printStackTrace(); - Logger.printError(component, "File "+outFile.getAbsolutePath()+" could not be closed."); + LOG.error("File "+outFile.getAbsolutePath()+" could not be closed.", e); } } } diff --git a/src/de/unihd/dbs/uima/reader/eventi2014reader/Eventi2014Reader.java b/src/de/unihd/dbs/uima/reader/eventi2014reader/Eventi2014Reader.java index 30e5dee0..979b7bbe 100644 --- a/src/de/unihd/dbs/uima/reader/eventi2014reader/Eventi2014Reader.java +++ b/src/de/unihd/dbs/uima/reader/eventi2014reader/Eventi2014Reader.java @@ -21,7 +21,7 @@ import java.util.HashSet; import java.util.LinkedList; import java.util.Queue; -import java.util.regex.MatchResult; +import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.uima.cas.CAS; @@ -33,9 +33,9 @@ import org.apache.uima.util.FileUtils; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox; import de.unihd.dbs.uima.types.heideltime.Dct; import de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Token; @@ -44,7 +44,8 @@ * CollectionReader for TempEval Data */ public class Eventi2014Reader extends CollectionReader_ImplBase { - private Class component = this.getClass(); + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(Eventi2014Reader.class); // uima descriptor parameter name private String PARAM_INPUTDIR = "InputDirectory"; @@ -54,10 +55,10 @@ public class Eventi2014Reader extends CollectionReader_ImplBase { // For improving the formatting 
of the documentText // -> to not have a space between all the tokens // HashSet containing tokens in front of which no white space is added - private HashSet hsNoSpaceBefore = new HashSet(); - private HashSet hsNoSpaceBehind = new HashSet(); + private HashSet hsNoSpaceBefore = new HashSet<>(); + private HashSet hsNoSpaceBehind = new HashSet<>(); - private Queue files = new LinkedList(); + private Queue files = new LinkedList<>(); public void initialize() throws ResourceInitializationException { String dirPath = (String) getConfigParameterValue(PARAM_INPUTDIR); @@ -113,24 +114,21 @@ private void fillJCas(JCas jcas) throws IOException, CollectionException { String lastTok = ""; int sentBegin = 0; int sentEnd = -1; - + + Pattern paConstraint = Pattern.compile(""); + Pattern paToken = Pattern.compile("(.*?)"); + Pattern paTimex3 = Pattern.compile("()"); + for (String line : lines) { - - // get document name - if (line.startsWith(""); - for (MatchResult mr : Toolbox.findMatches(paConstraint,line)) { + // get document name + if (line.startsWith("(.*?)"); - for (MatchResult mr : Toolbox.findMatches(paConstraint,line)) { - + for (Matcher mr = paToken.matcher(line); mr.find(); ) { String token = mr.group(4); // System.err.println("INPUT: -->" + token + "<--"); int tokID = Integer.parseInt(mr.group(1)); @@ -165,7 +163,7 @@ private void fillJCas(JCas jcas) throws IOException, CollectionException { // } else{ // tokens without space behind the tokens - if (!(hsNoSpaceBehind.contains(lastTok))){ + if (!hsNoSpaceBehind.contains(lastTok)){ tokBegin = text.length()+ 1; text = text + " " + token; } @@ -194,11 +192,10 @@ private void fillJCas(JCas jcas) throws IOException, CollectionException { // get the document creation time if (line.startsWith(")"); - for (MatchResult mr : Toolbox.findMatches(paConstraint,line)) { + for (Matcher mr = paTimex3.matcher(line); mr.find(); ) { fullDctTag = mr.group(1); dct = mr.group(2); - System.err.println("DCT: " + dct); + LOG.debug("DCT: {}", dct); } } } @@ -207,7 +204,7 @@ private void fillJCas(JCas jcas) throws IOException, CollectionException { jcas.setDocumentText(text); // add DCT to jcas - if (!(dct.equals(""))){ + if (!dct.equals("")){ Dct dctAnnotation = new Dct(jcas); dctAnnotation.setBegin(0); dctAnnotation.setEnd(text.length()); @@ -260,7 +257,7 @@ private void populateFileList(String dirPath) throws ResourceInitializationExcep // check for existence and readability; add handle to the list for(File f : myFiles) { if(!f.exists() || !f.isFile() || !f.canRead()) { - Logger.printDetail(component, "File \""+f.getAbsolutePath()+"\" was ignored because it either didn't exist, wasn't a file or wasn't readable."); + LOG.debug("File \"{}\" was ignored because it either didn't exist, wasn't a file or wasn't readable.", f.getAbsolutePath()); } else { files.add(f); } diff --git a/src/de/unihd/dbs/uima/reader/tempeval3reader/Tempeval3Reader.java b/src/de/unihd/dbs/uima/reader/tempeval3reader/Tempeval3Reader.java index fe302a0b..815824d7 100644 --- a/src/de/unihd/dbs/uima/reader/tempeval3reader/Tempeval3Reader.java +++ b/src/de/unihd/dbs/uima/reader/tempeval3reader/Tempeval3Reader.java @@ -21,13 +21,14 @@ import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; -import 
de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; import de.unihd.dbs.uima.types.heideltime.Dct; /** @@ -35,7 +36,8 @@ * */ public class Tempeval3Reader extends CollectionReader_ImplBase { - private Class component = this.getClass(); + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(Tempeval3Reader.class); // uima descriptor parameter name private String PARAM_INPUTDIR = "InputDirectory"; @@ -130,8 +132,7 @@ private void fillJCas(JCas jcas) { dct.setTimexId("t0"); dct.addToIndexes(); } catch(Exception e) { - e.printStackTrace(); - Logger.printError(component, "File "+f.getAbsolutePath()+" could not be properly parsed."); + LOG.error("File "+f.getAbsolutePath()+" could not be properly parsed.", e); } } @@ -160,7 +161,7 @@ private void populateFileList(String dirPath) throws ResourceInitializationExcep // check for existence and readability; add handle to the list for(File f : myFiles) { if(!f.exists() || !f.isFile() || !f.canRead()) { - Logger.printDetail(component, "File \""+f.getAbsolutePath()+"\" was ignored because it either didn't exist, wasn't a file or wasn't readable."); + LOG.debug("File \"{}\" was ignored because it either didn't exist, wasn't a file or wasn't readable.", f.getAbsolutePath()); } else { files.add(f); } diff --git a/src/hr/fer/zemris/takelab/uima/annotator/hunpos/HunPosAnnotionTranslator.java b/src/hr/fer/zemris/takelab/uima/annotator/hunpos/HunPosAnnotionTranslator.java index 19156bdf..37b346dd 100644 --- a/src/hr/fer/zemris/takelab/uima/annotator/hunpos/HunPosAnnotionTranslator.java +++ b/src/hr/fer/zemris/takelab/uima/annotator/hunpos/HunPosAnnotionTranslator.java @@ -12,9 +12,12 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class HunPosAnnotionTranslator { + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(HunPosAnnotionTranslator.class); private List mappings; @@ -43,28 +46,28 @@ private void loadTranslations() { Matcher m = reRule.matcher(line); if(!m.matches()) { - Logger.printError("Error matching HunPos annotation translation rule : " + line); + LOG.error("Error matching HunPos annotation translation rule : " + line); continue; } try { mappings.add(new HunPosAnnotationMapping(m.group(1), m.group(2))); } catch (Exception e) { - Logger.printError("Invalid regex in HunPos annotation matching rule " + m.group(1)); + LOG.error("Invalid regex in HunPos annotation matching rule " + m.group(1), e); continue; } } } catch (FileNotFoundException e) { - Logger.printError("Cannot find the HunPos annotation translation rules file."); + LOG.error("Cannot find the HunPos annotation translation rules file.", e); } catch (IOException e) { - Logger.printError("Error reading HunPos annotation translation rules file."); + LOG.error("Error reading HunPos annotation translation rules file.", e); } finally { try { if(reader != null) { reader.close(); } } catch (IOException e) { - Logger.printError("An error occured while closing the file."); + LOG.error("An error occurred while closing the file.", e); } } diff --git a/src/hr/fer/zemris/takelab/uima/annotator/hunpos/HunPosTaggerWrapper.java b/src/hr/fer/zemris/takelab/uima/annotator/hunpos/HunPosTaggerWrapper.java index 90f93b6f..445c4ff3 100644 --- a/src/hr/fer/zemris/takelab/uima/annotator/hunpos/HunPosTaggerWrapper.java +++ b/src/hr/fer/zemris/takelab/uima/annotator/hunpos/HunPosTaggerWrapper.java @@ -27,9 
+27,10 @@ import org.apache.uima.resource.ConfigurationManager; import org.apache.uima.resource.impl.ConfigurationManager_impl; import org.apache.uima.resource.impl.ResourceManager_impl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import de.unihd.dbs.uima.annotator.heideltime.resources.Language; -import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger; import de.unihd.dbs.uima.types.heideltime.Sentence; import de.unihd.dbs.uima.types.heideltime.Token; @@ -41,6 +42,8 @@ * */ public class HunPosTaggerWrapper extends JCasAnnotator_ImplBase{ + /** Class logger */ + private static final Logger LOG = LoggerFactory.getLogger(HunPosTaggerWrapper.class); public static final String PARAM_LANGUAGE = "language"; public static final String PARAM_PATH = "hunpos_path"; @@ -164,7 +167,7 @@ public static void initialize(String modelPath, String hunposPath) { } if(hunposRoot == null || !new File(hunposRoot).exists()) { - Logger.printError(HunPosWrapper.class, "The environment variable HUNPOS_HOME was not set, or set to \"" + hunposRoot + "\", which does not exist."); + LOG.error("The environment variable HUNPOS_HOME was not set, or set to \"" + hunposRoot + "\", which does not exist."); System.exit(-1); } File hunPosRootFile = new File(hunposRoot); @@ -176,7 +179,7 @@ public static void initialize(String modelPath, String hunposPath) { if(modelFile.exists()) { command.add(modelFile.getAbsolutePath()); } else { - Logger.printError(HunPosWrapper.class, "The supplied model path " + modelPath + " does not exist."); + LOG.error("The supplied model path " + modelPath + " does not exist."); System.exit(-1); } } @@ -190,13 +193,12 @@ public static void tagPOS(JCas jCas, boolean tagSentences) { try { p = Runtime.getRuntime().exec(cmd); } catch (IOException e2) { - Logger.printError(HunPosWrapper.class, "An error occured while trying to call HunPos at " + System.getenv(HUNPOS_HOME)); - e2.printStackTrace(); + LOG.error("An error occurred while trying to call HunPos at " + System.getenv(HUNPOS_HOME), e2); } Writer writer = new OutputStreamWriter(p.getOutputStream()); - Logger.printDetail(HunPosWrapper.class, "Starting the POS tagging process."); + LOG.debug("Starting the POS tagging process."); final List tokens = new ArrayList(); diff --git a/test/de/unihd/dbs/heideltime/test/english/AbstractHeideltimeTest.java b/test/de/unihd/dbs/heideltime/test/english/AbstractHeideltimeTest.java new file mode 100644 index 00000000..a481b216 --- /dev/null +++ b/test/de/unihd/dbs/heideltime/test/english/AbstractHeideltimeTest.java @@ -0,0 +1,153 @@ +package de.unihd.dbs.heideltime.test.english; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.uima.UIMAFramework; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.apache.uima.util.XMLInputSource; +import org.junit.Before; + +import de.unihd.dbs.heideltime.standalone.Config; +import de.unihd.dbs.heideltime.standalone.HeidelTimeStandalone; +import de.unihd.dbs.heideltime.standalone.components.impl.JCasFactoryImpl; +import de.unihd.dbs.heideltime.standalone.components.impl.UimaContextImpl; +import de.unihd.dbs.uima.annotator.heideltime.DocumentType; +import de.unihd.dbs.uima.annotator.heideltime.HeidelTime; +import de.unihd.dbs.uima.annotator.heideltime.resources.Language; +import 
de.unihd.dbs.uima.types.heideltime.Dct; +import de.unihd.dbs.uima.types.heideltime.Sentence; +import de.unihd.dbs.uima.types.heideltime.Timex3; +import de.unihd.dbs.uima.types.heideltime.Token; + +/** + * Abstract base class for unit testing Heideltime annotations. + * + * @author Erich Schubert + */ +public class AbstractHeideltimeTest { + + protected JCasFactoryImpl jcasFactory; + protected HeidelTime heideltime; + private boolean debugTokenization = false; + static final Pattern LINEWRAP = Pattern.compile("\\s*[\\n\\r]+\\s*"); + static final Pattern WORDS = Pattern.compile("(?U)([^\\s\\w]*)([\\w/]+(?:\\.\\d+)?)([^\\s\\w]*)"); + + @Before + public void init() { + try { + if (!Config.isInitialized()) + HeidelTimeStandalone.readConfigFile("test/test.props"); + TypeSystemDescription[] descriptions = new TypeSystemDescription[] { + UIMAFramework.getXMLParser().parseTypeSystemDescription(new XMLInputSource(this.getClass().getClassLoader().getResource(Config.get(Config.TYPESYSTEMHOME)))) }; + jcasFactory = new JCasFactoryImpl(descriptions); + heideltime = new HeidelTime(); + heideltime.initialize(new UimaContextImpl(Language.ENGLISH, DocumentType.COLLOQUIAL, false)); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public AbstractHeideltimeTest() { + super(); + } + + protected JCas tokenize(String fragment) { + JCas jcas = null; + try { + jcas = jcasFactory.createJCas(); + jcas.setDocumentText(fragment); + } catch (Exception e) { + fail("Cas object could not be generated"); + } + int last = 0; + for (Matcher sm = LINEWRAP.matcher(fragment); sm.find();) { + int ss = sm.start(), se = sm.end(); + if (last < ss) + tokenizeSentence(fragment, jcas, last, ss); + last = se; + } + if (last < fragment.length()) + tokenizeSentence(fragment, jcas, last, fragment.length()); + return jcas; + } + + private void tokenizeSentence(String fragment, JCas jcas, int ss, int se) { + // A single sentence: + Sentence s = new Sentence(jcas); + s.setBegin(ss); + s.setEnd(se); + s.addToIndexes(); + // Hard-coded tokenization: + for (Matcher m = WORDS.matcher(fragment).region(ss, se); m.find();) { + for (int i = 1; i <= 3; i++) { + int start = m.start(i), end = m.end(i); + if (start == end) + continue; + Token t = new Token(jcas); + t.setBegin(start); + t.setEnd(end); + t.setPos(""); + t.addToIndexes(); + if (debugTokenization) + System.out.print(fragment.substring(start, end) + "<=>"); + } + } + if (debugTokenization) + System.out.println(); + } + + protected JCas analyze(String fragment, String dctv) { + try { + JCas jcas = tokenize(fragment); + if (dctv != null) { + Dct dct = new Dct(jcas); + dct.setValue(dctv); + dct.addToIndexes(); + } + heideltime.process(jcas); + // intervaltagger.process(jcas); + return jcas; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + protected void testSingleCase(String fragment, String[]... expectf) { + testSingleCase(fragment, null, expectf); + } + + protected void testSingleCase(String fragment, String dctv, String[]... 
expectf) { + JCas jcas = analyze(fragment, dctv); + AnnotationIndex times = jcas.getAnnotationIndex(Timex3.type); + int cnt = 0; + for (Timex3 timex3 : times) { + ++cnt; + String mrule = timex3.getFoundByRule().replaceAll("-(relative|explicit)", ""); + String mstr = fragment.substring(timex3.getBegin(), timex3.getEnd()); + String mres = timex3.getTimexValue(); + boolean samerule = false, samestring = false, sameres = false; + for (String[] expect : expectf) { + samerule |= expect[0].equals(mrule); + samestring |= (expect.length > 1 ? expect[1] : "").equals(mstr); + sameres |= (expect.length > 2) ? expect[2].equals(mres) : false; + } + if (!samerule || !samestring || !sameres) { + System.err.println("Received: " + timex3); + for (String[] expect : expectf) { + System.err.println("Expected: " + String.join("\t", expect)); + } + } + assertTrue("Fragment >>" + fragment + "<< matched in a different part: >>" + mstr + "<< (rule " + mrule + ")", samestring); + assertTrue("Fragment >>" + fragment + "<< returned a different result: >>" + mres + "<< (rule " + mrule + ")", sameres); + assertTrue("Fragment >>" + fragment + "<< matched by different rule: " + mrule, samerule); + } + assertEquals("Number of results do not match.", expectf.length, cnt); + } +} \ No newline at end of file diff --git a/test/de/unihd/dbs/heideltime/test/english/EnglishDateHistoricTest.java b/test/de/unihd/dbs/heideltime/test/english/EnglishDateHistoricTest.java new file mode 100644 index 00000000..a8f7e80f --- /dev/null +++ b/test/de/unihd/dbs/heideltime/test/english/EnglishDateHistoricTest.java @@ -0,0 +1,218 @@ +package de.unihd.dbs.heideltime.test.english; + +import org.apache.uima.UIMAFramework; +import org.apache.uima.resource.metadata.TypeSystemDescription; +import org.apache.uima.util.XMLInputSource; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; + +import de.unihd.dbs.heideltime.standalone.Config; +import de.unihd.dbs.heideltime.standalone.HeidelTimeStandalone; +import de.unihd.dbs.heideltime.standalone.components.impl.JCasFactoryImpl; +import de.unihd.dbs.heideltime.standalone.components.impl.UimaContextImpl; +import de.unihd.dbs.uima.annotator.heideltime.DocumentType; +import de.unihd.dbs.uima.annotator.heideltime.HeidelTime; +import de.unihd.dbs.uima.annotator.heideltime.resources.Language; + +public class EnglishDateHistoricTest extends AbstractHeideltimeTest { + @Before + public void init() { + try { + if (!Config.isInitialized()) + HeidelTimeStandalone.readConfigFile("test/test.props"); + TypeSystemDescription[] descriptions = new TypeSystemDescription[] { + UIMAFramework.getXMLParser().parseTypeSystemDescription(new XMLInputSource(this.getClass().getClassLoader().getResource(Config.get(Config.TYPESYSTEMHOME)))) }; + jcasFactory = new JCasFactoryImpl(descriptions); + heideltime = new HeidelTime(); + heideltime.initialize(new UimaContextImpl(Language.ENGLISH, DocumentType.NARRATIVE, false)); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Test + public void testdate_historic_1a_BCADhint() { + // 1- to 4-digit year + testSingleCase("190 BC", // + new String[] { "date_historic_1a-BCADhint", "190 BC", "BC0190" }); + } + + @Test + public void testdate_historic_1b_BCADhint() { + // 1- to 4-digit year + testSingleCase("BC 190", // + new String[] { "date_historic_1b-BCADhint", "BC 190", "BC0190" }); + } + + @Test + public void testdate_historic_1c_BCADhint() { + // find "190 BC"; 1- to 4-digit year + testSingleCase("190 or 180 BC", // + new String[] { 
"x_date_historic_1c-BCADhint", "190", "BC0190" }, // + new String[] { "date_historic_1a-BCADhint", "180 BC", "BC0180" }); + } + + @Test + public void testdate_historic_2a_BCADhint() { + // 1- to 4-digit year + testSingleCase("March 190 BC", // + new String[] { "date_historic_2a-BCADhint", "March 190 BC", "BC0190-03" }); + } + + @Test + public void testdate_historic_2b() { + // 3-digit year + testSingleCase("March 190", // + new String[] { "date_historic_2b", "March 190", "0190-03" }); + } + + @Test + public void testdate_historic_2c() { + // 2-digit year + testSingleCase("in March 90", new String[] { "date_historic_2c", "March 90", "0090-03" }); + } + + @Test + public void testdate_historic_2d() { + // 2-digit year + testSingleCase("March of 90", new String[] { "date_historic_2d", "March of 90", "0090-03" }); + } + + @Test + public void testdate_historic_3a_BCADhint() { + // 1- to 4-digit year + testSingleCase("March 29, 190 BC", // + new String[] { "date_historic_3a-BCADhint", "March 29, 190 BC", "BC0190-03-29" }); + } + + @Test + public void testdate_historic_3b_BCADhint() { + // 1- to 4-digit year + testSingleCase("29 March 190 BC", // + new String[] { "date_historic_3b-BCADhint", "29 March 190 BC", "BC0190-03-29" }); + } + + @Test + public void testdate_historic_3c_BCADhint() { + // 1- to 4-digit year + testSingleCase("29th of March 190 BC", // + new String[] { "date_historic_3c-BCADhint", "29th of March 190 BC", "BC0190-03-29" }); + } + + @Test + public void testdate_historic_3d() { + // 3-digit year + testSingleCase("March 29, 190", // + new String[] { "date_historic_3d", "March 29, 190", "0190-03-29" }); + } + + @Test + public void testdate_historic_3e() { + // 2-digit year + testSingleCase("March 29, 90", // + new String[] { "date_historic_3e", "March 29, 90", "0090-03-29" }); + } + + @Test + public void testdate_historic_4a_BCADhint() { + // 1- to 4-digit year + testSingleCase("summer of 190 BC", // + new String[] { "date_historic_4a-BCADhint", "summer of 190 BC", "BC0190-SU" }); + } + + @Test + public void testdate_historic_5a_BCADhint() { + testSingleCase("the 2nd century BC", // + new String[] { "date_historic_5a-BCADhint", "the 2nd century BC", "BC01" }); + } + + @Test + public void testdate_historic_5b_BCADhint() { + testSingleCase("beginning of the 2nd century BC", // + new String[] { "date_historic_5b-BCADhint", "beginning of the 2nd century BC", "BC01" }); + } + + @Test + public void testdate_historic_5ca_BCADhint() { + // find "2nd century BC" + testSingleCase("2nd or 3rd century BC", // + new String[] { "date_historic_5c-BCADhint", "2nd", "BC01" }, // + new String[] { "date_historic_5a-BCADhint", "3rd century BC", "BC02" }); + } + + @Test + public void testdate_historic_5ad_BCADhint() { + // find "beginning 2nd century BC" + testSingleCase("beginning of the 2nd or 3rd century BC", // + new String[] { "date_historic_5d-BCADhint", "beginning of the 2nd", "BC01" }, // + new String[] { "date_historic_5a-BCADhint", "3rd century BC", "BC02" }); + } + + @Test + public void testdate_historic_6a_BCADhint() { + testSingleCase("1990s BC", // + new String[] { "date_historic_6a-BCADhint", "1990s BC", "BC199" }); + } + + @Test + public void testdate_historic_6b_BCADhint() { + testSingleCase("190s BC", // + new String[] { "date_historic_6b-BCADhint", "190s BC", "BC019" }); + } + + @Test + public void testdate_historic_6c_BCADhint() { + testSingleCase("90s BC", // + new String[] { "date_historic_6c-BCADhint", "90s BC", "BC009" }); + } + + @Test + public void testdate_historic_7ab() { + 
// 3-digit year + testSingleCase("in 190", new String[] { "date_historic_7ab", "190", "0190" }); + } + + @Ignore("Disabled, as this is also matched by the regular year pattern") + @Test + public void testdate_historic_7c() { + testSingleCase("\n190\n", new String[] { "date_historic_7c", "190", "0190" }); + } + + @Test + public void testdate_historic_7d() { + // 2-digit year + testSingleCase("year of 90", // + new String[] { "date_historic_7d", "year of 90", "0090" }); + } + + @Test + public void testdate_historic_7e() { + // 3-digit year + testSingleCase("year of 190", // + new String[] { "date_historic_7e", "year of 190", "0190" }); + } + + @Test + public void testdate_historic_8ab() { + // 2-digit year + testSingleCase("in 90,", new String[] { "date_historic_8ab", "90", "0090" }); + testSingleCase("in 90", new String[] { "date_historic_8ab", "90", "0090" }); + } + + // FIXME: add POS tags for unit test + @Ignore("Needs POS") + @Test + public void testdate_historic_0ab_negative() { + // 2- to 4-digit year + testSingleCase("in 90 cases"); + testSingleCase("in 90 nice cases"); + testSingleCase("in 90 nice law cases"); + } + + @Test + public void testdate_historic_0d_negative() { + // 2- to 4-digit year + testSingleCase("in 90 percent"); // EMPTY! + } +} diff --git a/test/de/unihd/dbs/heideltime/test/english/EnglishDateTest.java b/test/de/unihd/dbs/heideltime/test/english/EnglishDateTest.java new file mode 100644 index 00000000..467deedf --- /dev/null +++ b/test/de/unihd/dbs/heideltime/test/english/EnglishDateTest.java @@ -0,0 +1,712 @@ +package de.unihd.dbs.heideltime.test.english; + +import org.junit.Ignore; +import org.junit.Test; + +public class EnglishDateTest extends AbstractHeideltimeTest { + @Test + public void testdate_r0a() { + testSingleCase("2010-01-29", // + new String[] { "date_r0a", "2010-01-29", "2010-01-29" }); + } + + @Test + public void testdate_r0b() { + testSingleCase("10-29-99", // + new String[] { "date_r0b", "10-29-99", "1999-10-29" }); + } + + @Test + public void testdate_r0c() { + testSingleCase("09/26/1999", // + new String[] { "date_r0c", "09/26/1999", "1999-09-26" }); + } + + @Test + public void testdate_r0d() { + testSingleCase("09/26/99", // + new String[] { "date_r0d", "09/26/99", "1999-09-26" }); + } + + @Test + public void testdate_r0e() { + // find 7-14 + testSingleCase("7-14 (AP)", new String[] { "date_r0e", "7-14", "XXXX-07-14" }); + } + + @Test + public void testdate_r0g() { + testSingleCase("1.3.99", // + new String[] { "date_r0g", "1.3.99", "1999-03-01" }); + } + + @Test + public void testdate_r0h() { + testSingleCase("1.3.1999", // + new String[] { "date_r0h", "1.3.1999", "1999-03-01" }); + } + + @Test + public void testdate_r1a() { + testSingleCase("February 25, 2009", // + new String[] { "date_r1a", "February 25, 2009", "2009-02-25" }); + testSingleCase("Feb. 25, 2009", // + new String[] { "date_r1a", "Feb. 25, 2009", "2009-02-25" }); + testSingleCase("Feb. 25, 2009, Monday", // + new String[] { "date_r1a", "Feb. 
25, 2009, Monday", "2009-02-25" }); + } + + @Test + public void testdate_r1b() { + testSingleCase("25 February 2009", // + new String[] { "date_r1b", "25 February 2009", "2009-02-25" }); + testSingleCase("On 1 July 1913,", // + new String[] { "date_r1b", "1 July 1913", "1913-07-01" }); + } + + @Test + public void testdate_r1c() { + testSingleCase("25 of February 2009", // + new String[] { "date_r1c", "25 of February 2009", "2009-02-25" }); + } + + @Test + public void testdate_r2a() { + testSingleCase("November 19", // + new String[] { "date_r2a", "November 19", "XXXX-11-19" }); + testSingleCase("Nov 19", // + new String[] { "date_r2a", "Nov 19", "XXXX-11-19" }); + testSingleCase("January 19th", // + new String[] { "date_r2a", "January 19th", "XXXX-01-19" }); + testSingleCase("January nineteenth", // + new String[] { "date_r2a", "January nineteenth", "XXXX-01-19" }); + // Test with dct: + testSingleCase("Nov. 21", "19981102", // + new String[] { "date_r2a", "Nov. 21", "1998-11-21" }); + } + + @Test + public void testdate_r2b() { + testSingleCase("November 19-20", // + new String[] { "date_r2a", "November 19", "XXXX-11-19" }, // + new String[] { "date_r2b", "20", "XXXX-11-20" }); + } + + @Test + public void testdate_r2c() { + testSingleCase("19 November", // + new String[] { "date_r2c", "19 November", "XXXX-11-19" }); + testSingleCase("19 Nov", // + new String[] { "date_r2c", "19 Nov", "XXXX-11-19" }); + testSingleCase("19th of November", // + new String[] { "date_r2c", "19th of November", "XXXX-11-19" }); + } + + @Test + public void testdate_r2d() { + // find May 3 + testSingleCase("3 to 6 May", // + new String[] { "date_r2d", "3", "XXXX-05-03" }, // + new String[] { "date_r2c", "6 May", "XXXX-05-06" }); + } + + @Test + public void testdate_r2e() { + // find May 3, 2004 + testSingleCase("3 to 6 May 2004", // + new String[] { "date_r2e", "3", "2004-05-03" }, // + new String[] { "date_r1b", "6 May 2004", "2004-05-06" }); + } + + @Test + public void testdate_r2a2() { + testSingleCase("January 19th of that year", // + new String[] { "date_r2a2", "January 19th of that year", "XXXX-01-19" }); + } + + @Test + public void testdate_r2c2() { + testSingleCase("19th of January of the same year", // + new String[] { "date_r2c2", "19th of January of the same year", "XXXX-01-19" }); + } + + @Test + public void testdate_r3a() { + testSingleCase("Friday October 13", // + new String[] { "date_r3a", "Friday October 13", "XXXX-10-13" }); + testSingleCase("Monday, Oct 12", // + new String[] { "date_r3a", "Monday, Oct 12", "XXXX-10-12" }); + testSingleCase("Friday October 13 2009", // + new String[] { "date_r3b", "Friday October 13 2009", "2009-10-13" }); + testSingleCase("Monday, October 12th 2009", // + new String[] { "date_r3b", "Monday, October 12th 2009", "2009-10-12" }); + } + + @Test + public void testdate_r4ab() { + // find September 18 2010 + testSingleCase("September 14 and 18, 2010", // + new String[] { "date_r4a", "September 14", "2010-09-14" }, // + new String[] { "date_r4b", "18, 2010", "2010-09-18" }); + } + + @Test + public void testdate_r5a() { + testSingleCase("tomorrow", // + new String[] { "date_r5a", "tomorrow", "XXXX-XX-XX" }); + } + + @Test + public void testdate_r5b() { + testSingleCase("earlier yesterday", // + new String[] { "date_r5b", "earlier yesterday", "XXXX-XX-XX" }); + } + + @Test + public void testdate_r5c() { + testSingleCase("Monday", // + new String[] { "date_r5c", "Monday", "XXXX-XX-XX" }); + // Test with dct: + testSingleCase("Monday", "19981104", // + new String[] { 
"date_r5c", "Monday", "1998-11-02" }); + } + + @Test + public void testdate_r5d() { + testSingleCase("earlier Monday", // + new String[] { "date_r5d", "earlier Monday", "XXXX-XX-XX" }); + } + + @Test + public void testdate_r61() { + testSingleCase("the weekend", // + new String[] { "date_r61", "the weekend", "XXXX-WXX-WE" }); + } + + @Test + public void testdate_r7a() { + testSingleCase("November 2001", // + new String[] { "date_r7a", "November 2001", "2001-11" }); + testSingleCase("Nov. 2001", // + new String[] { "date_r7a", "Nov. 2001", "2001-11" }); + testSingleCase("February of 1999", // + new String[] { "date_r7a", "February of 1999", "1999-02" }); + } + + @Test + public void testdate_r7cd() { + // find May 2001 AND June 2011 + testSingleCase("May and June 2011", // + new String[] { "date_r7c", "May", "2011-05" }, // + new String[] { "date_r7d", "June 2011", "2011-06" }); + testSingleCase("May/June 2011", // + new String[] { "date_r7c", "May", "2011-05" }, // + new String[] { "date_r7d", "June 2011", "2011-06" }); + } + + @Test + public void testdate_r8a() { + testSingleCase("November next year", // + new String[] { "date_r8a", "November next year", "XXXX-11" }); + testSingleCase("May last year", // + new String[] { "date_r8a", "May last year", "XXXX-05" }); + } + + @Test + public void testdate_r9a() { + testSingleCase("summer", // + new String[] { "date_r9a", "summer", "XXXX-SU" }); + } + + @Test + public void testdate_r9b() { + testSingleCase("winter 2001", // + new String[] { "date_r9b", "winter 2001", "2001-WI" }); + testSingleCase("winter of 2001", // + new String[] { "date_r9b", "winter of 2001", "2001-WI" }); + } + + @Test + public void testdate_r9c() { + testSingleCase("summer of 69", // + new String[] { "date_r9c", "summer of 69", "1969-SU" }); + } + + @Test + public void testdate_r10a() { + testSingleCase("the third quarter of 2001", // + new String[] { "date_r10a", "the third quarter of 2001", "2001-Q3" }); + } + + // @Ignore("Disabled, false positives: shot a goal in the second half") + @Test + public void testdate_r10b() { + testSingleCase("the second half", // + new String[] { "date_r10b", "the second half", "XXXX-H2" }); + testSingleCase("the third-quarter", "2010-12-01", // + new String[] { "date_r10b", "the third-quarter", "2010-Q3" }); + } + + @Test + public void testdate_r10c() { + testSingleCase("the 2001 third quarter", // + new String[] { "date_r10c", "the 2001 third quarter", "2001-Q3" }); + } + + @Test + public void testdate_r11a() { + testSingleCase("this year's third quarter", // + new String[] { "date_r11a", "this year's third quarter", "XXXX-Q3" }); + testSingleCase("next year's first quarter", // + new String[] { "date_r11a", "next year's first quarter", "XXXX-Q1" }); + } + + @Test + public void testdate_r11b() { + // FIXME: this is supposed to match r11b, but is matched by date_r23a-relative + // As far as I can tell, they should both be good. 
+ testSingleCase("the year-earlier first half", // + new String[] { "date_r23a", "the year-earlier first half", "XXXX-H1" }); + // new String[] { "date_r11b", "the year-earlier first half", "XXXX-H1" }); + } + + @Test + public void testdate_r11c() { + testSingleCase("the second half of this year", // + new String[] { "date_r11c", "the second half of this year", "XXXX-H2" }); + } + + @Test + public void testdate_r12a() { + testSingleCase("2009", // + new String[] { "date_r12a", "2009", "2009" }); + } + + @Test + public void testdate_r12b() { + testSingleCase("1850-58", // + new String[] { "date_r12a", "1850", "1850" }, // + new String[] { "date_r12b", "58", "1858" }); + } + + @Test + public void testdate_r12c() { + testSingleCase("nineteen ninety-one", // + new String[] { "date_r12c", "nineteen ninety-one", "1991" }); + } + + @Test + public void testdate_r12d() { + testSingleCase("two-thousand ten", // + new String[] { "date_r12d", "two-thousand ten", "2010" }); + } + + @Test + public void testdate_r12f() { + testSingleCase("1940/1941", // + new String[] { "date_r12f1", "1940", "1940" }, // + new String[] { "date_r12f2", "1941", "1941" }); + } + + @Test + public void testdate_r13a() { + testSingleCase("the 1990s", // + new String[] { "date_r13a", "the 1990s", "199" }); + } + + @Test + public void testdate_r13b() { + testSingleCase("the 90s", // + new String[] { "date_r13b", "the 90s", "199" }); + } + + @Test + public void testdate_r13c() { + testSingleCase("the seventies", // + new String[] { "date_r13c", "the seventies", "197" }); + } + + @Test + public void testdate_r13d() { + testSingleCase("the nineteen seventies", // + new String[] { "date_r13d", "the nineteen seventies", "197" }); + } + + @Test + public void testdate_r14a() { + testSingleCase("the early 1990s", // + new String[] { "date_r14a", "the early 1990s", "199" }); + } + + @Test + public void testdate_r14b() { + testSingleCase("the mid-90s", // + new String[] { "date_r14b", "the mid-90s", "199" }); + } + + @Test + public void testdate_r14c() { + testSingleCase("the late seventies", // + new String[] { "date_r14c", "the late seventies", "197" }); + } + + @Test + public void testdate_r14d() { + testSingleCase("the early nineteen seventies", // + new String[] { "date_r14d", "the early nineteen seventies", "197" }); + } + + @Test + public void testdate_r15a() { + testSingleCase("the 19th century", // + new String[] { "date_r15a", "the 19th century", "18" }); + testSingleCase("the seventh century", // + new String[] { "date_r15a", "the seventh century", "06" }); + } + + @Test + public void testdate_r15c() { + testSingleCase("19th and 20th century", // + new String[] { "date_r15c", "19th", "18" }, // + new String[] { "date_r15a", "20th century", "19" }); + } + + @Test + public void testdate_r15b() { + testSingleCase("19th and early 20th century", // + new String[] { "date_r15c", "19th", "18" }, // + new String[] { "date_r15b", "early 20th century", "19" }); + } + + @Test + public void testdate_r16a() { + testSingleCase("March", // + new String[] { "date_r16a", "March", "XXXX-03" }); + } + + @Test + public void testdate_r16b() { + testSingleCase("Early 2001", // + new String[] { "date_r16b", "Early 2001", "2001" }); + } + + @Test + public void testdate_r16c() { + testSingleCase("the beginning of November 1999", // + new String[] { "date_r16c", "the beginning of November 1999", "1999-11" }); + } + + @Test + public void testdate_r16d() { + testSingleCase("the middle of September", // + new String[] { "date_r16d", "the middle of 
September", "XXXX-09" }); + } + + @Test + public void testdate_r17a() { + testSingleCase("In 2010, this year", // + new String[] { "date_r12a", "2010", "2010" }, // + new String[] { "date_r17a", "this year", "2010" }); + } + + @Test + public void testdate_r17b() { + testSingleCase("In 1999, this November", // + new String[] { "date_r12a", "1999", "1999" }, // + new String[] { "date_r17b", "this November", "1999-11" }); + } + + @Test + public void testdate_r17c() { + testSingleCase("In 1998, this November 24", // + new String[] { "date_r12a", "1998", "1998" }, // + new String[] { "date_r17c", "this November 24", "1998-11-24" }); + } + + @Test + public void testdate_r17d() { + testSingleCase("this Monday", // + new String[] { "date_r17d", "this Monday", "XXXX-WXX-1" }); + } + + @Test + public void testdate_r17e() { + testSingleCase("this summer", // + new String[] { "date_r17e", "this summer", "XXXX-SU" }); + } + + @Test + public void testdate_r17f() { + testSingleCase("On November 24 1998, this day", // + new String[] { "date_r1a", "November 24 1998", "1998-11-24" }, // + new String[] { "date_r17f", "this day", "1998-11-24" }); + } + + @Test + public void testdate_r18a() { + testSingleCase("the beginning of this year", // + new String[] { "date_r18a", "the beginning of this year", "XXXX" }); + } + + @Test + public void testdate_r18b() { + testSingleCase("the beginning of this November", // + new String[] { "date_r18b", "the beginning of this November", "XXXX-11" }); + } + + @Test + public void testdate_r18c() { + testSingleCase("the beginning of this November 24", // + new String[] { "date_r18c", "the beginning of this November 24", "XXXX-11-24" }); + } + + @Test + public void testdate_r18d() { + testSingleCase("the beginning of this Monday", // + new String[] { "date_r18d", "the beginning of this Monday", "XXXX-WXX-1" }); + } + + @Test + public void testdate_r18e() { + testSingleCase("the beginning of this summer", // + new String[] { "date_r18e", "the beginning of this summer", "XXXX-SU" }); + } + + @Test + public void testdate_r19a() { + testSingleCase("at least several years ago", // + new String[] { "date_r19a", "at least several years ago", "PAST_REF" }); + } + + @Test + public void testdate_r19b() { + testSingleCase("In 2010, about twenty years ago", // + new String[] { "date_r12a", "2010", "2010" }, // + new String[] { "date_r19b", "about twenty years ago", "1990" }); + } + + @Test + public void testdate_r19c() { + testSingleCase("about 20 years ago", // + new String[] { "date_r19c", "about 20 years ago", "XXXX" }); + } + + @Test + public void testdate_r19d() { + testSingleCase("January 24 1998, a month ago", // + new String[] { "date_r1a", "January 24 1998", "1998-01-24" }, // + new String[] { "date_r19d", "a month ago", "1997-12" }); + } + + @Test + public void testdate_r20a() { + testSingleCase("some days later", // + new String[] { "date_r20a", "some days later", "FUTURE_REF" }); + } + + @Test + public void testdate_r20b() { + testSingleCase("about twenty days later", // + new String[] { "date_r20b", "about twenty days later", "XXXX-XX-XX" }); + } + + @Test + public void testdate_r20c() { + testSingleCase("about 20 days later", // + new String[] { "date_r20c", "about 20 days later", "XXXX-XX-XX" }); + } + + @Test + public void testdate_r20d() { + testSingleCase("December 29 1998, a week later", // + new String[] { "date_r1a", "December 29 1998", "1998-12-29" }, // + new String[] { "date_r20d", "a week later", "1999-01-05" }); + } + + @Test + public void testdate_r20f() { + 
testSingleCase("on 30 minutes something happened", // + new String[] { "date_r20f", "on 30 minutes", "UNDEF-REF-minute-PLUS-30" }); + } + + @Test + public void testdate_r20g() { + testSingleCase("on approximately thirty minutes something happened", // + new String[] { "date_r20g", "on approximately thirty minutes", "UNDEF-REF-minute-PLUS-30" }); + } + + @Test + public void testdate_r21a() { + testSingleCase("14 January 1998, twenty days earlier", // + new String[] { "date_r1b", "14 January 1998", "1998-01-14" }, // + new String[] { "date_r21a", "twenty days earlier", "1997-12-25" }); + } + + @Test + public void testdate_r21b() { + testSingleCase("14 January 1998, about 20 days earlier", // + new String[] { "date_r1b", "14 January 1998", "1998-01-14" }, // + new String[] { "date_r21b", "about 20 days earlier", "1997-12-25" }); + } + + @Test + public void testdate_r21c() { + testSingleCase("a week earlier", // + new String[] { "date_r21c", "a week earlier", "XXXX-WXX" }); + } + + @Test + public void testdate_r22a() { + testSingleCase("14 January 1998, a year ago", // + new String[] { "date_r1b", "14 January 1998", "1998-01-14" }, // + new String[] { "date_r22a", "a year ago", "1997-01-14" }); + testSingleCase("a year ago", // + new String[] { "date_r22a", "a year ago", "XXXX" }); + } + + @Test + public void testdate_r22b() { + testSingleCase("14 January 1998, a year later", // + new String[] { "date_r1b", "14 January 1998", "1998-01-14" }, // + new String[] { "date_r22b", "a year later", "1999-01-14" }); + } + + @Test + public void testdate_r23a() { + testSingleCase("the year-earlier first quarter", // + new String[] { "date_r23a", "the year-earlier first quarter", "XXXX-Q1" }); + testSingleCase("the year-earlier first quarter", "2010-12-01", // + new String[] { "date_r23a", "the year-earlier first quarter", "2009-Q1" }); + } + + @Test + public void testdate_r23b() { + testSingleCase("the year-earlier quarter", // + new String[] { "date_r23b", "the year-earlier quarter", "XXXX-XX" }); + } + + @Test + public void testdate_r23c() { + testSingleCase("the quarter", // + new String[] { "date_r23c", "the quarter", "XXXX-XX" }); + } + + @Test + public void testdate_r24a() { + testSingleCase("Christmas", // + new String[] { "date_r24a", "Christmas", "XXXX-12-25" }); + } + + @Test + public void testdate_r24b() { + testSingleCase("Christmas 2010", // + new String[] { "date_r24b", "Christmas 2010", "2010-12-25" }); + } + + @Test + public void testdate_r24cd() { + testSingleCase("Christmas 87", // + new String[] { "date_r24cd", "Christmas 87", "1987-12-25" }); + testSingleCase("Christmas '87", // + new String[] { "date_r24cd", "Christmas '87", "1987-12-25" }); + } + + @Test + public void testdate_r25a() { + testSingleCase("In 2010, on Easter Sunday", // + new String[] { "date_r12a", "2010", "2010" }, // + new String[] { "date_r25a", "Easter Sunday", "2010-04-04" }); + } + + @Test + public void testdate_r25b() { + testSingleCase("Easter Sunday 2010", // + new String[] { "date_r25b", "Easter Sunday 2010", "2010-04-04" }); + } + + @Test + public void testdate_r25cd() { + testSingleCase("Easter Sunday 87", // + new String[] { "date_r25cd", "Easter Sunday 87", "1987-04-19" }); + testSingleCase("Easter Sunday '87", // + new String[] { "date_r25cd", "Easter Sunday '87", "1987-04-19" }); + } + + @Test + public void testdate_r1a_negative() { + // do not match soon if it is in "as soon as" + testSingleCase("as soon as"); + } + + @Test + public void testdate_r2a_negative() { + // if it is a verb + 
testSingleCase("they march the way"); + } + + @Test + public void testdate_r2b_negative() { + // if it is a verb + testSingleCase("they march the way"); + } + + @Test + public void testdate_r2c_negative() { + // if it is a verb + testSingleCase("may"); + } + + @Test + public void testdate_r2d_negative() { + // or march, fall -- if it is lower case and without any further temporal stuff around it... + testSingleCase("may"); + } + + // FIXME: add POS information + @Ignore("Requires POS tagging") + @Test + public void testdate_r3a_negative() { + // four digit number followed by a plural noun + testSingleCase("2000 soldiers"); + } + + // FIXME: add POS information + @Ignore("Requires POS tagging") + @Test + public void testdate_r3b_negative() { + // four digit number followed by an adjective and a plural noun + testSingleCase("2000 dead soldiers"); + } + + @Test + public void testdate_r3c_negative() { + // four digit number followed a non-temporal unit + testSingleCase("2000 kilometer"); + } + + @Test + public void testdate_r4a_negative() { + testSingleCase("W2000.1920"); + testSingleCase("to 1462.93."); + } + + @Test + public void testx_date_r11a_negative() { + testSingleCase("in his 20s"); + } + + @Test + public void testTokenBoundaryFilter() { + testSingleCase("$2016 is not a date."); + testSingleCase("2016° is too hot"); + testSingleCase("1234.2016 or 2016.1234 are not a date either."); + testSingleCase("2016dimensional nonsense"); + testSingleCase("Okay: (2016).", // + new String[] { "date_r12a", "2016", "2016" }); + } + + @Test + public void testNextQuarter() { + testSingleCase("November 2015, 1 quarter later", new String[] { "date_r7a", "November 2015", "2015-11" }, new String[] { "date_r20c", "1 quarter later", "2016-Q1" }); + } +} diff --git a/test/de/unihd/dbs/heideltime/test/english/EnglishDurationTest.java b/test/de/unihd/dbs/heideltime/test/english/EnglishDurationTest.java new file mode 100644 index 00000000..29f13740 --- /dev/null +++ b/test/de/unihd/dbs/heideltime/test/english/EnglishDurationTest.java @@ -0,0 +1,87 @@ +package de.unihd.dbs.heideltime.test.english; + +import org.junit.Test; + +public class EnglishDurationTest extends AbstractHeideltimeTest { + @Test + public void testduration_r1ad() { + testSingleCase("less than sixty days", // + new String[] { "duration_r1a", "less than sixty days", "P60D" }); + testSingleCase("less than sixty minutes", // + new String[] { "duration_r1d", "less than sixty minutes", "PT1H" }); + } + + @Test + public void testduration_r1e12() { + testSingleCase("less than 60 days", // + new String[] { "duration_r1e1", "less than 60 days", "P60D" }); + testSingleCase("less than 60 minutes", // + new String[] { "duration_r1e2", "less than 60 minutes", "PT1H" }); + } + + @Test + public void testduration_r1cf() { + testSingleCase("several days", // + new String[] { "duration_r1c", "several days", "PXD" }); + testSingleCase("several minutes", // + new String[] { "duration_r1f", "several minutes", "PTXM" }); + } + + @Test + public void testduration_r2ad() { + testSingleCase("at least the last twenty years", // + new String[] { "duration_r2a", "at least the last twenty years", "P20Y" }); + testSingleCase("at least the last twenty minutes", // + new String[] { "duration_r2d", "at least the last twenty minutes", "PT20M" }); + } + + @Test + public void testduration_r2be() { + testSingleCase("at least the last 20 years", // + new String[] { "duration_r2b", "at least the last 20 years", "P20Y" }); + testSingleCase("at least the last 20 minutes", // + new 
String[] { "duration_r2e", "at least the last 20 minutes", "PT20M" }); + } + + @Test + public void testduration_r2cf() { + testSingleCase("at least the last several years", // + new String[] { "duration_r2c", "at least the last several years", "PXY" }); + testSingleCase("at least the last several minutes", // + new String[] { "duration_r2f", "at least the last several minutes", "PTXM" }); + } + + @Test + public void testduration_r3a() { + testSingleCase("a three-year period", // + new String[] { "duration_r3a", "a three-year period", "P3Y" }); + } + + @Test + public void testduration_r3b() { + testSingleCase("a 300 year period", // + new String[] { "duration_r3b", "a 300 year period", "P300Y" }); + } + + @Test + public void testduration_r5b1() { + testSingleCase("two and six days", // + new String[] { "duration_r5b1", "two", "P2D" }, // + new String[] { "duration_r1a", "six days", "P6D" }); + } + + @Test + public void testduration_r1a_negative() { + testSingleCase("about 200 years older"); // EMPTY! + } + + @Test + public void testduration_r1b_negative() { + testSingleCase("several days old"); // EMPTY! + } + + @Test + public void testduration_r1c_negative() { + testSingleCase("59-year-old"); // EMPTY! + } +} diff --git a/test/de/unihd/dbs/heideltime/test/english/EnglishIntervalTest.java b/test/de/unihd/dbs/heideltime/test/english/EnglishIntervalTest.java new file mode 100644 index 00000000..6d65ea03 --- /dev/null +++ b/test/de/unihd/dbs/heideltime/test/english/EnglishIntervalTest.java @@ -0,0 +1,106 @@ +package de.unihd.dbs.heideltime.test.english; + +import org.apache.uima.jcas.JCas; +import org.junit.Test; + +import de.unihd.dbs.heideltime.standalone.components.impl.StandaloneConfigContext; +import de.unihd.dbs.uima.annotator.intervaltagger.IntervalTagger; + +public class EnglishIntervalTest extends AbstractHeideltimeTest { + protected IntervalTagger intervaltagger; + + @Override + public void init() { + super.init(); + try { + intervaltagger = new IntervalTagger(); + StandaloneConfigContext aContext = new StandaloneConfigContext(); + + // construct a context for the uima engine + aContext.setConfigParameterValue(IntervalTagger.PARAM_LANGUAGE, "english"); + aContext.setConfigParameterValue(IntervalTagger.PARAM_INTERVALS, Boolean.TRUE); + aContext.setConfigParameterValue(IntervalTagger.PARAM_INTERVAL_CANDIDATES, Boolean.FALSE); + + intervaltagger.initialize(aContext); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Test + public void testinterval_01() { + testSingleCase("from 1999 to 2012", // + new String[] { "interval_01", "from 1999 to 2012" }); + } + + @Test + public void testinterval_02() { + testSingleCase("between March and May", // + new String[] { "interval_02", "between March and May" }); + } + + @Test + public void testinterval_03() { + testSingleCase("20.3.2003 - 1.5.2003", // + new String[] { "interval_03", "20.3.2003 - 1.5.2003" }); + } + + @Test + public void testinterval_04() { + testSingleCase("20.3.2003 to 1.5.2003", // + new String[] { "interval_04", "20.3.2003 to 1.5.2003" }); + } + + @Test + public void testinterval_05() { + testSingleCase("on 20.3.2003 the war began and it lasted until 1.5.2003", // + new String[] { "interval_05", "on 20.3.2003 the war began and it lasted until 1.5.2003" }); + } + + @Test + public void testinterval_06() { + testSingleCase("for December after leaving in February", // + new String[] { "interval_06", "for December after leaving in February" }); + } + + @Test + public void testinterval_07() { + 
testSingleCase("began on March 20 in 2003 and ended on May 1", // + new String[] { "interval_07", "began on March 20 in 2003 and ended on May 1" }); + } + + @Test + public void testinterval_08() { + testSingleCase("in 1999/2000", // + new String[] { "interval_08", "in 1999/2000" }); + } + + @Test + public void testinterval_09() { + testSingleCase("War ended in May, after fighting from March on", // + new String[] { "interval_09", "War ended in May, after fighting from March on" }); + } + + @Test + public void testinterval_10() { + testSingleCase("March, April and May", // + new String[] { "interval_10", "March, April and May" }); + } + + @Test + public void testinterval_11() { + testSingleCase("Monday, Thuesday, Wednesday and Thursday", // + new String[] { "interval_11", "Monday, Thuesday, Wednesday and Thursday" }); + } + + protected JCas analyze(String fragment) { + try { + JCas jcas = tokenize(fragment); + heideltime.process(jcas); + intervaltagger.process(jcas); + return jcas; + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/test/de/unihd/dbs/heideltime/test/english/EnglishSetRules.java b/test/de/unihd/dbs/heideltime/test/english/EnglishSetRules.java new file mode 100644 index 00000000..2ec8a4c0 --- /dev/null +++ b/test/de/unihd/dbs/heideltime/test/english/EnglishSetRules.java @@ -0,0 +1,103 @@ +package de.unihd.dbs.heideltime.test.english; + +import org.junit.Test; + +public class EnglishSetRules extends AbstractHeideltimeTest { + @Test + public void testset_r1a() { + testSingleCase("each day", // + new String[] { "set_r1a", "each day", "P1D" }); + } + + @Test + public void testset_r1b() { + testSingleCase("every Monday", // + new String[] { "set_r1b", "every Monday", "XXXX-WXX-1" }); + } + + @Test + public void testset_r1c() { + testSingleCase("each September", // + new String[] { "set_r1c", "each September", "XXXX-09" }); + } + + @Test + public void testset_r1d() { + testSingleCase("every summer", // + new String[] { "set_r1d", "every summer", "XXXX-SU" }); + } + + @Test + public void testset_r2a() { + testSingleCase("once a week", // + new String[] { "set_r2a", "once a week", "P1W" }); + } + + @Test + public void testset_r2b() { + testSingleCase("twice a month", // + new String[] { "set_r2b", "twice a month", "P1M" }); + } + + @Test + public void testset_r2c() { + testSingleCase("three times a month", // + new String[] { "set_r2c", "three times a month", "P1M" }); + } + + @Test + public void testset_r2d() { + testSingleCase("40 times per month", // + new String[] { "set_r2d", "40 times per month", "P1M" }); + } + + @Test + public void testset_r2e() { + testSingleCase("a month", // + new String[] { "set_r2e", "a month", "P1M" }); + } + + @Test + public void testset_r2f() { + testSingleCase("a minute", // + new String[] { "set_r2f", "a minute", "PT1M" }); + } + + @Test + public void testset_r3a() { + testSingleCase("every 5 years", // + new String[] { "set_r3a", "every 5 years", "P5Y" }); + } + + @Test + public void testset_r3b() { + testSingleCase("every two days", // + new String[] { "set_r3b", "every two days", "P2D" }); + } + + @Test + public void testset_r4a() { + testSingleCase("2 days each week", // + new String[] { "set_r4a", "2 days each week", "P1W" }); + } + + @Test + public void testset_r5a() { + testSingleCase("annually", // + new String[] { "set_r5a", "annually", "XXXX" }); + } + + @Test + public void testset_r6a() { + testSingleCase("Monday afternoons", // + new String[] { "set_r6a", "Monday afternoons", "XXXX-WXX-1TAF" }); + } + + 
@Test + public void testset_r6b() { + // find: Monday nights + testSingleCase("Monday and Tuesday nights", // + new String[] { "set_r6b", "Monday", "XXXX-WXX-1TNI" }, // + new String[] { "set_r6a", "Tuesday nights", "XXXX-WXX-2TNI" }); + } +} diff --git a/test/de/unihd/dbs/heideltime/test/english/EnglishTimeTest.java b/test/de/unihd/dbs/heideltime/test/english/EnglishTimeTest.java new file mode 100644 index 00000000..91e74ac4 --- /dev/null +++ b/test/de/unihd/dbs/heideltime/test/english/EnglishTimeTest.java @@ -0,0 +1,199 @@ +package de.unihd.dbs.heideltime.test.english; + +import org.junit.Test; + +public class EnglishTimeTest extends AbstractHeideltimeTest { + @Test + public void testtime_r1a() { + testSingleCase("2009-12-19T17:00:00", // + new String[] { "time_r1a", "2009-12-19T17:00:00", "2009-12-19T17:00:00" }); + testSingleCase("2009-12-19 17:00:00", // + new String[] { "time_r1a", "2009-12-19 17:00:00", "2009-12-19T17:00:00" }); + } + + @Test + public void testtime_r1b() { + testSingleCase("2009-12-19T17:00", // + new String[] { "time_r1b", "2009-12-19T17:00", "2009-12-19T17:00" }); + } + + @Test + public void testtime_r1c() { + testSingleCase("12/29/2000 20:29", // + new String[] { "time_r1c", "12/29/2000 20:29", "2000-12-29T20:29" }); + } + + @Test + public void testtime_r1d() { + testSingleCase("12/29/2000 20:29:29", // + new String[] { "time_r1d", "12/29/2000 20:29:29", "2000-12-29T20:29:29" }); + } + + @Test + public void testtime_r1e() { + testSingleCase("12/29/2000 20:29:29.79", // + new String[] { "time_r1e", "12/29/2000 20:29:29.79", "2000-12-29T20:29:29.79" }); + } + + @Test + public void testtime_r2a() { + testSingleCase("09-24-99 1145EST", // + new String[] { "time_r2a", "09-24-99 1145EST", "1999-09-24T11:45-05" }); + } + + @Test + public void testtime_r2b() { + testSingleCase("November 24, 2011 1535 GMT", // + new String[] { "time_r2b", "November 24, 2011 1535 GMT", "2011-11-24T15:35" }); + } + + @Test + public void testtime_r2d() { + testSingleCase("Wed, 29 Dec 2004 00:28:16 +0000", // + new String[] { "time_r2d", "Wed, 29 Dec 2004 00:28:16 +0000", "2004-12-29T00:28:16+00" }); + testSingleCase("Sat, 29 Jan 2005 17:21:13 -0600", // + new String[] { "time_r2d", "Sat, 29 Jan 2005 17:21:13 -0600", "2005-01-29T17:21:13-06" }); + testSingleCase("1 Feb 2005 16:13:33 +1300", // + new String[] { "time_r2d", "1 Feb 2005 16:13:33 +1300", "2005-02-01T16:13:33+13" }); + } + + @Test + public void testtime_r3a() { + testSingleCase("midnight Monday", // + new String[] { "time_r3a", "midnight Monday", "XXXX-XX-XXT24:00" }); + // TODO: 'monday' is lost? + } + + @Test + public void testtime_r3b() { + testSingleCase("Monday night", // + new String[] { "time_r3b", "Monday night", "XXXX-XX-XXTNI" }); + // TODO: 'monday' is lost? + } + + @Test + public void testtime_r3b2() { + testSingleCase("early Friday morning", // + new String[] { "time_r3b2", "early Friday morning", "XXXX-XX-XXTMO" }); + // TODO: 'friday' is lost? 
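+		// (the weekday is dropped from the value; only the part-of-day "MO"
+		// survives in XXXX-XX-XXTMO)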
+ } + + @Test + public void testtime_r3c() { + testSingleCase("midnight today", // + new String[] { "time_r3c", "midnight today", "XXXX-XX-XXT24:00" }); + } + + @Test + public void testtime_r3d() { + testSingleCase("yesterday morning", // + new String[] { "time_r3d", "yesterday morning", "XXXX-XX-XXTMO" }); + } + + @Test + public void testtime_r3d2() { + testSingleCase("late yesterday evening", // + new String[] { "time_r3d2", "late yesterday evening", "XXXX-XX-XXTEV" }); + } + + @Test + public void testtime_r3e() { + testSingleCase("last Friday morning", // + new String[] { "time_r3e", "last Friday morning", "XXXX-XX-XXTMO" }); + // TODO: 'friday' is lost? + } + + @Test + public void testtime_r4a() { + testSingleCase("earlier this afternoon", // + new String[] { "time_r4a", "earlier this afternoon", "XXXX-XX-XXTAF" }); + testSingleCase("later last night", // + new String[] { "time_r4a", "later last night", "XXXX-XX-XXTNI" }); + } + + @Test + public void testtime_r4b() { + testSingleCase("tonight", // + new String[] { "time_r4b", "tonight", "XXXX-XX-XXTNI" }); + } + + @Test + public void testtime_r5a() { + testSingleCase("circa 9 a.m.", // + new String[] { "time_r5a", "circa 9 a.m.", "XXXX-XX-XXT09:00" }); + } + + @Test + public void testtime_r5b() { + testSingleCase("11 PM", // + new String[] { "time_r5b", "11 PM", "XXXX-XX-XXT23:00" }); + } + + @Test + public void testtime_r5c() { + testSingleCase("11:30 a.m.", // + new String[] { "time_r5c", "11:30 a.m.", "XXXX-XX-XXT11:30" }); + } + + @Test + public void testtime_r5d() { + testSingleCase("9:30 p.m.", // + new String[] { "time_r5d", "9:30 p.m.", "XXXX-XX-XXT21:30" }); + } + + @Test + public void testtime_r5e() { + testSingleCase("10:30:34 a.m.", // + new String[] { "time_r5e", "10:30:34 a.m.", "XXXX-XX-XXT10:30:34" }); + } + + @Test + public void testtime_r5f() { + testSingleCase("10:30:34 p.m.", // + new String[] { "time_r5f", "10:30:34 p.m.", "XXXX-XX-XXT22:30:34" }); + } + + @Test + public void testtime_r6a() { + testSingleCase("9 am Wednesday", // + new String[] { "time_r6a", "9 am Wednesday", "XXXX-XX-XXT09:00" }); + } + + @Test + public void testtime_r6b() { + testSingleCase("9 pm Wednesday", // + new String[] { "time_r6b", "9 pm Wednesday", "XXXX-XX-XXT21:00" }); + } + + @Test + public void testtime_r6c() { + testSingleCase("9:30 a.m. Wednesday", // + new String[] { "time_r6c", "9:30 a.m. Wednesday", "XXXX-XX-XXT09:30" }); + } + + @Test + public void testtime_r6d() { + testSingleCase("9:30 p.m. Wednesday", // + new String[] { "time_r6d", "9:30 p.m. 
Wednesday", "XXXX-XX-XXT21:30" }); + } + + @Test + public void testtime_r7a() { + testSingleCase("16:00 CET", // + new String[] { "time_r7a", "16:00 CET", "XXXX-XX-XXT16:00" }); + testSingleCase("1600 CET", // + new String[] { "time_r7a", "1600 CET", "XXXX-XX-XXT16:00" }); + } + + @Test + public void testtime_r8a() { + testSingleCase("the morning of April 18, 1775", // + new String[] { "time_r8a", "the morning of April 18, 1775", "1775-04-18TMO" }); + } + + @Test + public void testtime_r8b() { + testSingleCase("the morning of April 18", // + new String[] { "time_r8b", "the morning of April 18", "XXXX-04-18TMO" }); + } +} diff --git a/test/logback.xml b/test/logback.xml new file mode 100644 index 00000000..8179c447 --- /dev/null +++ b/test/logback.xml @@ -0,0 +1,12 @@ + + + + + %-5level %logger{36} - %msg%n + + + + + + + \ No newline at end of file diff --git a/test/test.props b/test/test.props new file mode 100644 index 00000000..ad97718a --- /dev/null +++ b/test/test.props @@ -0,0 +1,78 @@ +################################ +## MAIN ## +################################ +# Consideration of different timex3-types +# Date +considerDate = true + +# Duration +considerDuration = true + +# Set +considerSet = true + +# Time +considerTime = true + +# Temponyms (make sure you know what you do if you set this to "true") +considerTemponym = false + +################################### +# Path to TreeTagger home directory +################################### +# Ensure there is no white space in path (try to escape white spaces) +treeTaggerHome = SET ME IN CONFIG.PROPS! (e.g., /home/jannik/treetagger) +# This one is only necessary if you want to process chinese documents. +chineseTokenizerPath = SET ME IN CONFIG.PROPS! (e.g., /home/jannik/treetagger/chinese-tokenizer) + +################################## +# paths to JVnTextPro model paths: +################################## +sent_model_path = SET ME IN CONFIG.PROPS! (e.g., /home/jannik/jvntextpro/models/jvnsensegmenter) +word_model_path = SET ME IN CONFIG.PROPS! (e.g., /home/jannik/jvntextpro/models/jvnsegmenter) +pos_model_path = SET ME IN CONFIG.PROPS! (e.g., /home/jannik/jvntextpro/models/jvnpostag/maxent) + +##################################################### +# paths to Stanford POS Tagger model or config files: +##################################################### +model_path = SET ME IN CONFIG.PROPS! (e.g., /home/jannik/stanford-postagger-full-2014-01-04/models/arabic.tagger) +# leave this unset if you do not need one (e.g., /home/jannik/stanford-postagger-full-2014-01-04/tagger.config) +config_path = + +######################################## +## paths to hunpos and its tagger files: +######################################## +hunpos_path = SET ME IN CONFIG.PROPS! (e.g., /home/jannik/hunpos) +hunpos_model_name = SET ME IN CONFIG.PROPS! (e.g., model.hunpos.mte5.defnpout) + + + +# DO NOT CHANGE THE FOLLOWING +################################ +# Relative path of type system in HeidelTime home directory +typeSystemHome = desc/type/HeidelTime_TypeSystem.xml + +# Relative path of dkpro type system in HeidelTime home directory +typeSystemHome_DKPro = desc/type/DKPro_TypeSystem.xml + +# Name of uima-context variables... 
+# ...for date-consideration +uimaVarDate = Date + +# ...for duration-consideration +uimaVarDuration = Duration + +# ...for language +uimaVarLanguage = Language + +# ...for set-consideration +uimaVarSet = Set + +# ...for time-consideration +uimaVarTime = Time + +# ...for temponym-consideration +uimaVarTemponym = Temponym + +# ...for type to process +uimaVarTypeToProcess = Type
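
For reference, a minimal usage sketch of the new harness. The class ExampleDateTest below is hypothetical and not part of the patch; the fragments, rule names, and expected TIMEX3 values are copied from the tests above.

package de.unihd.dbs.heideltime.test.english;

import org.junit.Test;

// Hypothetical example of a custom test built on AbstractHeideltimeTest.
public class ExampleDateTest extends AbstractHeideltimeTest {
	@Test
	public void testExplicitYear() {
		// Each expectation array is { rule, matched substring, TIMEX3 value }.
		testSingleCase("2009", //
				new String[] { "date_r12a", "2009", "2009" });
	}

	@Test
	public void testRelativeWithDct() {
		// With a document creation time, underspecified expressions resolve
		// against it: "Monday" relative to 1998-11-04 becomes 1998-11-02.
		testSingleCase("Monday", "19981104", //
				new String[] { "date_r5c", "Monday", "1998-11-02" });
	}
}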