diff --git a/pom.xml b/pom.xml
index efa8c991..2474b218 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
com.github.heideltime
heideltime
- 2.2.1
+ 2.2.2-SNAPSHOT
HeidelTime
HeidelTime is a multilingual cross-domain temporal tagger that extracts temporal expressions from documents and normalizes them according to the TIMEX3 annotation standard.
@@ -24,6 +24,8 @@
UTF-8
+ true
+ true
@@ -50,6 +52,8 @@
src
${basedir}/class
+ test
+ ${basedir}/testclass
${basedir}
@@ -70,8 +74,17 @@
maven-compiler-plugin
3.1
-
- 1.7
+
+ 1.8
+
+
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+
+
+ ${project.build.directory}/lib
+
@@ -183,21 +196,21 @@
org.apache.uima
uimaj-core
- 2.8.1
+ 2.10.2
provided
edu.stanford.nlp
stanford-corenlp
- 3.3.1
+ 3.8.0
provided
args4j
args4j
- 2.32
+ 2.33
provided
@@ -206,5 +219,32 @@
0.1
provided
+
+
+ org.slf4j
+ slf4j-api
+ 1.7.25
+ provided
+
+
+
+ ch.qos.logback
+ logback-core
+ [1.2.3,)
+ provided
+
+
+ ch.qos.logback
+ logback-classic
+ [1.2.3,)
+ provided
+
+
+
+ junit
+ junit
+ [4.12,5)
+ test
+
diff --git a/resources/english/normalization/resources_normalization_normDay.txt b/resources/english/normalization/resources_normalization_normDay.txt
index 1c161e02..fbaa332d 100644
--- a/resources/english/normalization/resources_normalization_normDay.txt
+++ b/resources/english/normalization/resources_normalization_normDay.txt
@@ -1,52 +1,42 @@
// author: Jannik Strötgen
-// email: stroetgen@uni-hd.de
+// email: stroetgen@uni-hd.de
// date: 2011-06-10
// This file contains "day words" and their normalized expressions
-// according to TIMEX3 format.
+// according to TIMEX3 format.
// For example, the normalized value of "first" is "01"
// FORMAT: "day-word","normalized-day-word"
-"0","00"
-"00","00"
-"1","01"
-"01","01"
-"2","02"
-"02","02"
-"3","03"
-"03","03"
-"4","04"
-"04","04"
-"5","05"
-"05","05"
-"6","06"
-"06","06"
-"7","07"
-"07","07"
-"8","08"
-"08","08"
-"9","09"
-"09","09"
-"10","10"
-"11","11"
-"12","12"
-"13","13"
-"14","14"
-"15","15"
-"16","16"
-"17","17"
-"18","18"
-"19","19"
-"20","20"
-"21","21"
-"22","22"
-"23","23"
-"24","24"
-"25","25"
-"26","26"
-"27","27"
-"28","28"
-"29","29"
-"30","30"
-"31","31"
+"00?\.?","00"
+"0?1\.?","01"
+"0?2\.?","02"
+"0?3\.?","03"
+"0?4\.?","04"
+"0?5\.?","05"
+"0?6\.?","06"
+"0?7\.?","07"
+"0?8\.?","08"
+"0?9\.?","09"
+"10\.?","10"
+"11\.?","11"
+"12\.?","12"
+"13\.?","13"
+"14\.?","14"
+"15\.?","15"
+"16\.?","16"
+"17\.?","17"
+"18\.?","18"
+"19\.?","19"
+"20\.?","20"
+"21\.?","21"
+"22\.?","22"
+"23\.?","23"
+"24\.?","24"
+"25\.?","25"
+"26\.?","26"
+"27\.?","27"
+"28\.?","28"
+"29\.?","29"
+"30\.?","30"
+"31\.?","31"
//
"first","01"
"second","02"
@@ -115,15 +105,15 @@
"Thirtieth","30"
"Thirty-first","31"
//
-"1st","01"
-"2nd","02"
-"3rd","03"
-"4th","04"
-"5th","05"
-"6th","06"
-"7th","07"
-"8th","08"
-"9th","09"
+"0?1st","01"
+"0?2nd","02"
+"0?3rd","03"
+"0?4th","04"
+"0?5th","05"
+"0?6th","06"
+"0?7th","07"
+"0?8th","08"
+"0?9th","09"
"10th","10"
"11th","11"
"12th","12"
diff --git a/resources/english/normalization/resources_normalization_normDayInWeek.txt b/resources/english/normalization/resources_normalization_normDayInWeek.txt
index d0385a18..0c5817ce 100644
--- a/resources/english/normalization/resources_normalization_normDayInWeek.txt
+++ b/resources/english/normalization/resources_normalization_normDayInWeek.txt
@@ -19,3 +19,5 @@
"Friday","5"
"Saturday","6"
"Sunday","7"
+// Popular spelling mistakes
+"[Ww]e[dn][nd]e?sday","3"
diff --git a/resources/english/normalization/resources_normalization_normDurationNumber.txt b/resources/english/normalization/resources_normalization_normDurationNumber.txt
index c514cc9d..bab75f98 100644
--- a/resources/english/normalization/resources_normalization_normDurationNumber.txt
+++ b/resources/english/normalization/resources_normalization_normDurationNumber.txt
@@ -1,10 +1,10 @@
-// author: Jannik Strötgen
-// email: stroetgen@uni-hd.de
-// date: 2011-06-10
-// This file contains "duration numbers" and their normalized expressions
-// according to TIMEX3 format.
-// For example, the normalized value of "one" is "1"
-// FORMAT: "duration-number","normalized-duration-number"
+// author: Jannik Strötgen
+// email: stroetgen@uni-hd.de
+// date: 2011-06-10
+// This file contains "duration numbers" and their normalized expressions
+// according to TIMEX3 format.
+// For example, the normalized value of "one" is "1"
+// FORMAT: "duration-number","normalized-duration-number"
"0","0"
"00","0"
"1","1"
@@ -47,7 +47,7 @@
"29","29"
"30","30"
"31","31"
-// normal numbers
+// normal numbers
"one","1"
"two","2"
"three","3"
@@ -68,158 +68,86 @@
"eighteen","18"
"nineteen","19"
"twenty","20"
-"twenty-one","21"
-"twenty-two","22"
-"twenty-three","23"
-"twenty-four","24"
-"twenty-five","25"
-"twenty-six","26"
-"twenty-seven","27"
-"twenty-eight","28"
-"twenty-nine","29"
-"twenty one","21"
-"twenty two","22"
-"twenty three","23"
-"twenty four","24"
-"twenty five","25"
-"twenty six","26"
-"twenty seven","27"
-"twenty eight","28"
-"twenty nine","29"
+"twenty[ -]?one","21"
+"twenty[ -]?two","22"
+"twenty[ -]?three","23"
+"twenty[ -]?four","24"
+"twenty[ -]?five","25"
+"twenty[ -]?six","26"
+"twenty[ -]?seven","27"
+"twenty[ -]?eight","28"
+"twenty[ -]?nine","29"
"thirty","30"
-"thirty-one","31"
-"thirty-two","32"
-"thirty-three","33"
-"thirty-four","34"
-"thirty-five","35"
-"thirty-six","36"
-"thirty-seven","37"
-"thirty-eight","38"
-"thirty-nine","39"
-"thirty one","31"
-"thirty two","32"
-"thirty three","33"
-"thirty four","34"
-"thirty five","35"
-"thirty six","36"
-"thirty seven","37"
-"thirty eight","38"
-"thirty nine","39"
+"thirty[ -]?one","31"
+"thirty[ -]?two","32"
+"thirty[ -]?three","33"
+"thirty[ -]?four","34"
+"thirty[ -]?five","35"
+"thirty[ -]?six","36"
+"thirty[ -]?seven","37"
+"thirty[ -]?eight","38"
+"thirty[ -]?nine","39"
"forty","40"
-"forty-one","41"
-"forty-two","42"
-"forty-three","43"
-"forty-four","44"
-"forty-five","45"
-"forty-six","46"
-"forty-seven","47"
-"forty-eight","48"
-"forty-nine","49"
-"forty one","41"
-"forty two","42"
-"forty three","43"
-"forty four","44"
-"forty five","45"
-"forty six","46"
-"forty seven","47"
-"forty eight","48"
-"forty nine","49"
+"forty[ -]?one","41"
+"forty[ -]?two","42"
+"forty[ -]?three","43"
+"forty[ -]?four","44"
+"forty[ -]?five","45"
+"forty[ -]?six","46"
+"forty[ -]?seven","47"
+"forty[ -]?eight","48"
+"forty[ -]?nine","49"
"fifty","50"
-"fifty-one","51"
-"fifty-two","52"
-"fifty-three","53"
-"fifty-four","54"
-"fifty-five","55"
-"fifty-six","56"
-"fifty-seven","57"
-"fifty-eight","58"
-"fifty-nine","59"
-"fifty one","51"
-"fifty two","52"
-"fifty three","53"
-"fifty four","54"
-"fifty five","55"
-"fifty six","56"
-"fifty seven","57"
-"fifty eight","58"
-"fifty nine","59"
+"fifty[ -]?one","51"
+"fifty[ -]?two","52"
+"fifty[ -]?three","53"
+"fifty[ -]?four","54"
+"fifty[ -]?five","55"
+"fifty[ -]?six","56"
+"fifty[ -]?seven","57"
+"fifty[ -]?eight","58"
+"fifty[ -]?nine","59"
"sixty","60"
-"sixty-one","61"
-"sixty-two","62"
-"sixty-three","63"
-"sixty-four","64"
-"sixty-five","65"
-"sixty-six","66"
-"sixty-seven","67"
-"sixty-eight","68"
-"sixty-nine","69"
-"sixty one","61"
-"sixty two","62"
-"sixty three","63"
-"sixty four","64"
-"sixty five","65"
-"sixty six","66"
-"sixty seven","67"
-"sixty eight","68"
-"sixty nine","69"
+"sixty[ -]?one","61"
+"sixty[ -]?two","62"
+"sixty[ -]?three","63"
+"sixty[ -]?four","64"
+"sixty[ -]?five","65"
+"sixty[ -]?six","66"
+"sixty[ -]?seven","67"
+"sixty[ -]?eight","68"
+"sixty[ -]?nine","69"
"seventy","70"
-"seventy-one","71"
-"seventy-two","72"
-"seventy-three","73"
-"seventy-four","74"
-"seventy-five","75"
-"seventy-six","76"
-"seventy-seven","77"
-"seventy-eight","78"
-"seventy-nine","79"
-"seventy one","71"
-"seventy two","72"
-"seventy three","73"
-"seventy four","74"
-"seventy five","75"
-"seventy six","76"
-"seventy seven","77"
-"seventy eight","78"
-"seventy nine","79"
+"seventy[ -]?one","71"
+"seventy[ -]?two","72"
+"seventy[ -]?three","73"
+"seventy[ -]?four","74"
+"seventy[ -]?five","75"
+"seventy[ -]?six","76"
+"seventy[ -]?seven","77"
+"seventy[ -]?eight","78"
+"seventy[ -]?nine","79"
"eighty","80"
-"eighty-one","81"
-"eighty-two","82"
-"eighty-three","83"
-"eighty-four","84"
-"eighty-five","85"
-"eighty-six","86"
-"eighty-seven","87"
-"eighty-eight","88"
-"eighty-nine","89"
-"eighty one","81"
-"eighty two","82"
-"eighty three","83"
-"eighty four","84"
-"eighty five","85"
-"eighty six","86"
-"eighty seven","87"
-"eighty eight","88"
-"eighty nine","89"
+"eighty[ -]?one","81"
+"eighty[ -]?two","82"
+"eighty[ -]?three","83"
+"eighty[ -]?four","84"
+"eighty[ -]?five","85"
+"eighty[ -]?six","86"
+"eighty[ -]?seven","87"
+"eighty[ -]?eight","88"
+"eighty[ -]?nine","89"
"ninety","90"
-"ninety-one","91"
-"ninety-two","92"
-"ninety-three","93"
-"ninety-four","94"
-"ninety-five","95"
-"ninety-six","96"
-"ninety-seven","97"
-"ninety-eight","98"
-"ninety-nine","99"
-"ninety one","91"
-"ninety two","92"
-"ninety three","93"
-"ninety four","94"
-"ninety five","95"
-"ninety six","96"
-"ninety seven","97"
-"ninety eight","98"
-"ninety nine","99"
-// UPPER CASE
+"ninety[ -]?one","91"
+"ninety[ -]?two","92"
+"ninety[ -]?three","93"
+"ninety[ -]?four","94"
+"ninety[ -]?five","95"
+"ninety[ -]?six","96"
+"ninety[ -]?seven","97"
+"ninety[ -]?eight","98"
+"ninety[ -]?nine","99"
+// UPPER CASE
"One","1"
"Two","2"
"Three","3"
@@ -240,155 +168,83 @@
"Eighteen","18"
"Nineteen","19"
"Twenty","20"
-"Twenty-one","21"
-"Twenty-two","22"
-"Twenty-three","23"
-"Twenty-four","24"
-"Twenty-five","25"
-"Twenty-six","26"
-"Twenty-seven","27"
-"Twenty-eight","28"
-"Twenty-nine","29"
-"Twenty one","21"
-"Twenty two","22"
-"Twenty three","23"
-"Twenty four","24"
-"Twenty five","25"
-"Twenty six","26"
-"Twenty seven","27"
-"Twenty eight","28"
-"Twenty nine","29"
+"Twenty[ -]?one","21"
+"Twenty[ -]?two","22"
+"Twenty[ -]?three","23"
+"Twenty[ -]?four","24"
+"Twenty[ -]?five","25"
+"Twenty[ -]?six","26"
+"Twenty[ -]?seven","27"
+"Twenty[ -]?eight","28"
+"Twenty[ -]?nine","29"
"Thirty","30"
-"Thirty-one","31"
-"Thirty-two","32"
-"Thirty-three","33"
-"Thirty-four","34"
-"Thirty-five","35"
-"Thirty-six","36"
-"Thirty-seven","37"
-"Thirty-eight","38"
-"Thirty-nine","39"
-"Thirty one","31"
-"Thirty two","32"
-"Thirty three","33"
-"Thirty four","34"
-"Thirty five","35"
-"Thirty six","36"
-"Thirty seven","37"
-"Thirty eight","38"
-"Thirty nine","39"
+"Thirty[ -]?one","31"
+"Thirty[ -]?two","32"
+"Thirty[ -]?three","33"
+"Thirty[ -]?four","34"
+"Thirty[ -]?five","35"
+"Thirty[ -]?six","36"
+"Thirty[ -]?seven","37"
+"Thirty[ -]?eight","38"
+"Thirty[ -]?nine","39"
"Forty","40"
-"Forty-one","41"
-"Forty-two","42"
-"Forty-three","43"
-"Forty-four","44"
-"Forty-five","45"
-"Forty-six","46"
-"Forty-seven","47"
-"Forty-eight","48"
-"Forty-nine","49"
-"Forty one","41"
-"Forty two","42"
-"Forty three","43"
-"Forty four","44"
-"Forty five","45"
-"Forty six","46"
-"Forty seven","47"
-"Forty eight","48"
-"Forty nine","49"
+"Forty[ -]?one","41"
+"Forty[ -]?two","42"
+"Forty[ -]?three","43"
+"Forty[ -]?four","44"
+"Forty[ -]?five","45"
+"Forty[ -]?six","46"
+"Forty[ -]?seven","47"
+"Forty[ -]?eight","48"
+"Forty[ -]?nine","49"
"Fifty","50"
-"Fifty-one","51"
-"Fifty-two","52"
-"Fifty-three","53"
-"Fifty-four","54"
-"Fifty-five","55"
-"Fifty-six","56"
-"Fifty-seven","57"
-"Fifty-eight","58"
-"Fifty-nine","59"
-"Fifty one","51"
-"Fifty two","52"
-"Fifty three","53"
-"Fifty four","54"
-"Fifty five","55"
-"Fifty six","56"
-"Fifty seven","57"
-"Fifty eight","58"
-"Fifty nine","59"
+"Fifty[ -]?one","51"
+"Fifty[ -]?two","52"
+"Fifty[ -]?three","53"
+"Fifty[ -]?four","54"
+"Fifty[ -]?five","55"
+"Fifty[ -]?six","56"
+"Fifty[ -]?seven","57"
+"Fifty[ -]?eight","58"
+"Fifty[ -]?nine","59"
"Sixty","60"
-"Sixty-one","61"
-"Sixty-two","62"
-"Sixty-three","63"
-"Sixty-four","64"
-"Sixty-five","65"
-"Sixty-six","66"
-"Sixty-seven","67"
-"Sixty-eight","68"
-"Sixty-nine","69"
-"Sixty one","61"
-"Sixty two","62"
-"Sixty three","63"
-"Sixty four","64"
-"Sixty five","65"
-"Sixty six","66"
-"Sixty seven","67"
-"Sixty eight","68"
-"Sixty nine","69"
+"Sixty[ -]?one","61"
+"Sixty[ -]?two","62"
+"Sixty[ -]?three","63"
+"Sixty[ -]?four","64"
+"Sixty[ -]?five","65"
+"Sixty[ -]?six","66"
+"Sixty[ -]?seven","67"
+"Sixty[ -]?eight","68"
+"Sixty[ -]?nine","69"
"Seventy","70"
-"Seventy-one","71"
-"Seventy-two","72"
-"Seventy-three","73"
-"Seventy-four","74"
-"Seventy-five","75"
-"Seventy-six","76"
-"Seventy-seven","77"
-"Seventy-eight","78"
-"Seventy-nine","79"
-"Seventy one","71"
-"Seventy two","72"
-"Seventy three","73"
-"Seventy four","74"
-"Seventy five","75"
-"Seventy six","76"
-"Seventy seven","77"
-"Seventy eight","78"
-"Seventy nine","79"
+"Seventy[ -]?one","71"
+"Seventy[ -]?two","72"
+"Seventy[ -]?three","73"
+"Seventy[ -]?four","74"
+"Seventy[ -]?five","75"
+"Seventy[ -]?six","76"
+"Seventy[ -]?seven","77"
+"Seventy[ -]?eight","78"
+"Seventy[ -]?nine","79"
"Eighty","80"
-"Eighty-one","81"
-"Eighty-two","82"
-"Eighty-three","83"
-"Eighty-four","84"
-"Eighty-five","85"
-"Eighty-six","86"
-"Eighty-seven","87"
-"Eighty-eight","88"
-"Eighty-nine","89"
-"Eighty one","81"
-"Eighty two","82"
-"Eighty three","83"
-"Eighty four","84"
-"Eighty five","85"
-"Eighty six","86"
-"Eighty seven","87"
-"Eighty eight","88"
-"Eighty nine","89"
+"Eighty[ -]?one","81"
+"Eighty[ -]?two","82"
+"Eighty[ -]?three","83"
+"Eighty[ -]?four","84"
+"Eighty[ -]?five","85"
+"Eighty[ -]?six","86"
+"Eighty[ -]?seven","87"
+"Eighty[ -]?eight","88"
+"Eighty[ -]?nine","89"
"Ninety","90"
-"Ninety-one","91"
-"Ninety-two","92"
-"Ninety-three","93"
-"Ninety-four","94"
-"Ninety-five","95"
-"Ninety-six","96"
-"Ninety-seven","97"
-"Ninety-eight","98"
-"Ninety-nine","99"
-"Ninety one","91"
-"Ninety two","92"
-"Ninety three","93"
-"Ninety four","94"
-"Ninety five","95"
-"Ninety six","96"
-"Ninety seven","97"
-"Ninety eight","98"
-"Ninety nine","99"
+"Ninety[ -]?one","91"
+"Ninety[ -]?two","92"
+"Ninety[ -]?three","93"
+"Ninety[ -]?four","94"
+"Ninety[ -]?five","95"
+"Ninety[ -]?six","96"
+"Ninety[ -]?seven","97"
+"Ninety[ -]?eight","98"
+"Ninety[ -]?nine","99"
diff --git a/resources/english/normalization/resources_normalization_normHolidayFix.txt b/resources/english/normalization/resources_normalization_normHolidayFix.txt
index 524ecf1a..c5928ec8 100644
--- a/resources/english/normalization/resources_normalization_normHolidayFix.txt
+++ b/resources/english/normalization/resources_normalization_normHolidayFix.txt
@@ -8,62 +8,48 @@
// the reference of the values is given in the repattern file
// http://en.wikipedia.org/wiki/New_Year%27s_Day
-"New Year's Day","01-01"
+"New Year'?s [Dd]ay","01-01"
// http://en.wikipedia.org/wiki/Groundhog_Day
-"Groundhog Day","02-02"
+"Groundhog [Dd]ay","02-02"
// http://en.wikipedia.org/wiki/Valentine%27s_Day
-"Valentines Day","02-14"
-"Saint Valentines Day","02-14"
-"St. Valentines Day","02-14"
-"Valentine's Day","02-14"
-"Saint Valentine's Day","02-14"
-"St. Valentine's Day","02-14"
-"Valentines' Day","02-14"
-"Saint Valentines' Day","02-14"
-"St. Valentines' Day","02-14"
+"(?:Saint |St\. )?Valentine'?s'? [Dd]ay","02-14"
// http://en.wikipedia.org/wiki/Patriots%27_Day
-"Patriots Day","09-11"
-"Patriots' Day","09-11"
-"Patriot's Day","09-11"
+"Patriot'?s'? [Dd]ay","09-11"
// http://en.wikipedia.org/wiki/German-American_Day
-"German-American Day","10-06"
+"German-American [Dd]ay","10-06"
// http://en.wikipedia.org/wiki/White_Cane_Safety_Day
-"White Cane Safety Day","10-15"
+"White Cane Safety [Dd]ay","10-15"
// http://en.wikipedia.org/wiki/Boss%27s_Day
-"Boss's Day","10-16"
-"Bosses Day","10-16"
-"Bosses' Day","10-16"
+"Boss(?:'s|es|es') [Dd]ay","10-16"
//won't work, because of the "$" in the string
-//"The Bo$$ Day","10-16"
-"National Boss Day","10-16"
+"The Bo\$\$ [Dd]ay","10-16"
+"National Boss [Dd]ay","10-16"
// http://en.wikipedia.org/wiki/Independence_Day_%28US%29
-"Independence Day","07-04"
+"Independence [Dd]ay","07-04"
"Fourth of July","07-04"
// http://en.wikipedia.org/wiki/Veterans_Day
-"Veterans Day","11-11"
-"Armistice Day","11-11"
+"Veterans [Dd]ay","11-11"
+"Armistice [Dd]ay","11-11"
// http://en.wikipedia.org/wiki/Remembrance_Day
-"Remembrance Day","11-11"
-"Poppy Day","11-11"
+"Remembrance [Dd]ay","11-11"
+"Poppy [Dd]ay","11-11"
// http://en.wikipedia.org/wiki/Epiphany_%28holiday%29
"Epiphany","02-06"
"Theophany","02-06"
// http://en.wikipedia.org/wiki/Hallowe%27en
-"Halloween","10-31"
-"All Hallows’ Evening","10-31"
-"Hallowe'en","10-31"
-"All Hallows' Eve","10-31"
+"Hallowe'?en","10-31"
+"All Hallows' [Ee]ve(?:ning)?","10-31"
// http://en.wikipedia.org/wiki/Assumption_of_Mary
"Assumption of Mary","08-15"
@@ -71,69 +57,37 @@
"The Assumption","08-15"
// http://en.wikipedia.org/wiki/Reformation_Day
-"Reformation Day","10-31"
+"Reformation [Dd]ay","10-31"
// http://en.wikipedia.org/wiki/All_Saints%27_Day
-"All Saints","11-01"
-"All Saints' Day","11-01"
+"All Saints'?(?: [Dd]ay)?","11-01"
"Solemnity of All Saints","11-01"
"All Hallows","11-01"
"Hallowmas","11-01"
// http://en.wikipedia.org/wiki/Christmas
-"Christmas Eve","12-24"
-
-"Christmas","12-25"
-"Christmas Day","12-25"
-"Xmas","12-25"
-"XMAS","12-25"
+"Christmas [Ee]ve","12-24"
+"Christmas(?: [Dd]ay)?","12-25"
+"X-?(?:mas|MAS)","12-25"
"Noel","12-25"
"Yule","12-25"
// http://en.wikipedia.org/wiki/New_Year%27s_Eve
-"New Year's Eve","12-31"
+"New Year'?s [Ee]ve","12-31"
"Hogmanay","12-31"
"Calennig","12-31"
// http://en.wikipedia.org/wiki/May_Day
-"May Day","05-01"
-"International Workers Day","05-01"
-"International Worker's Day","05-01"
-"International Workers' Day","05-01"
+"May [Dd]ay","05-01"
+"International Worker'?s'? [Dd]ay","05-01"
// http://en.wikipedia.org/wiki/Boxing_Day
-"Boxing Day","12-26"
+"Boxing [Dd]ay","12-26"
// http://en.wikipedia.org/wiki/St._Patrick%27s_Day
-"Saint Patricks Day","03-17"
-"St. Patricks Day","03-17"
-"Saint Patricks Day","03-17"
-"St. Paddys Day","03-17"
-"Saint Paddys Day","03-17"
-"St. Pattys Day","03-17"
-"Saint Pattys Day","03-17"
-
-"Saint Patrick's Day","03-17"
-"St. Patrick's Day","03-17"
-"Saint Patrick's Day","03-17"
-"St. Paddy's Day","03-17"
-"Saint Paddy's Day","03-17"
-"St. Patty's Day","03-17"
-"Saint Patty's Day","03-17"
-
-"Saint Patricks' Day","03-17"
-"St. Patricks' Day","03-17"
-"Saint Patricks' Day","03-17"
-"St. Paddys' Day","03-17"
-"Saint Paddys' Day","03-17"
-"St. Pattys' Day","03-17"
-"Saint Pattys' Day","03-17"
+"(?:Saint|St\.) Pa(?:trick|ddy|tty)'?s'? [Dd]ay","03-17"
// http://en.wikipedia.org/wiki/St._Andrew%27s_Day
-"Saint Andrews Day","11-30"
-"Saint Andrew's Day","11-30"
-"Saint Andrews' Day","11-30"
-"St. Andrews Day","11-30"
-"St. Andrew's Day","11-30"
-"St. Andrews' Day","11-30"
+"(?:Saint|St\.) Andrew'?s'? [Dd]ay","11-30"
+
diff --git a/resources/english/normalization/resources_normalization_normHolidayVar.txt b/resources/english/normalization/resources_normalization_normHolidayVar.txt
index 28422976..6acc3863 100644
--- a/resources/english/normalization/resources_normalization_normHolidayVar.txt
+++ b/resources/english/normalization/resources_normalization_normHolidayVar.txt
@@ -22,9 +22,9 @@
// http://en.wikipedia.org/wiki/Maundy_Thursday
"Maundy Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))"
+"Great & Holy Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))"
"Holy Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))"
"Covenant Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))"
-"Great & Holy Thursday","00-00 funcDateCalc(EasterSunday(YEAR, -3))"
"Thursday of Mysteries","00-00 funcDateCalc(EasterSunday(YEAR, -3))"
// http://en.wikipedia.org/wiki/Good_Friday
@@ -42,23 +42,17 @@
"Joyous Saturday","00-00 funcDateCalc(EasterSunday(YEAR, -1))"
// http://en.wikipedia.org/wiki/Easter
-"Easter","00-00 funcDateCalc(EasterSunday(YEAR, 0))"
-"Easter Day","00-00 funcDateCalc(EasterSunday(YEAR, 0))"
-"Easter Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 0))"
-"Resurrection Day","00-00 funcDateCalc(EasterSunday(YEAR, 0))"
-"Resurrection Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 0))"
+"Easter(?: ?[Ss]unday| ?[Dd]ay)?","00-00 funcDateCalc(EasterSunday(YEAR, 0))"
+"Resurrection(?: ?[Ss]unday| ?[Dd]ay)?","00-00 funcDateCalc(EasterSunday(YEAR, 0))"
// http://en.wikipedia.org/wiki/Easter_Monday
"Easter Monday","00-00 funcDateCalc(EasterSunday(YEAR, 1))"
// http://en.wikipedia.org/wiki/Octave_of_Easter
-"Octave of Easter","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
-"Octave Day of Easter","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
+"Octave(?: [Dd]ay)? of Easter","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
"Low Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
-"Saint Thomas Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
-"St. Thomas Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
-"Quasimodo Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
-"Quasimodogeniti","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
+"(?:Saint|St\.) Thomas Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
+"Quasimodo(?: Sunday|geniti)","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
"Second Sunday of Easter","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
"Divine Mercy Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 7))"
@@ -68,65 +62,55 @@
"Ascension Thursday","00-00 funcDateCalc(EasterSunday(YEAR, 39))"
// http://en.wikipedia.org/wiki/Father%27s_Day
-"Father's Day","06-00 funcDateCalc(EasterSunday(YEAR, 39))"
+"Father'?s'? [Dd]ay","06-00 funcDateCalc(EasterSunday(YEAR, 39))"
// http://en.wikipedia.org/wiki/Reformation_Day
"Reformation Sunday","10-00 funcDateCalc(WeekdayRelativeTo(YEAR-10-31, 1, -1, true))"
// http://en.wikipedia.org/wiki/Pentecost
"Pentecost","00-00 funcDateCalc(EasterSunday(YEAR, 49))"
-"Whit Sunday","00-00 funcDateCalc(EasterSunday(YEAR, 49))"
-"Whitsun","00-00 funcDateCalc(EasterSunday(YEAR, 49))"
-"Whit","00-00 funcDateCalc(EasterSunday(YEAR, 49))"
+"Whit(?:sun| ?[Ss]unday)?","00-00 funcDateCalc(EasterSunday(YEAR, 49))"
// http://en.wikipedia.org/wiki/Corpus_Christi_%28feast%29
"Corpus Christi","00-00 funcDateCalc(EasterSunday(YEAR, 60))"
"The Most Holy Body and Blood of Christ","00-00 funcDateCalc(EasterSunday(YEAR, 60))"
// advent sundays: the four sundays before christmas(12-24)
-"1. Advent","00-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -4, false))"
-"1st Advent","00-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -4, false))"
+"1(?:st|\.)? Advent","00-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -4, false))"
"first Advent","00-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -4, false))"
-"2. Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -3, false))"
-"2nd Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -3, false))"
+"2(?:nd|\.)? Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -3, false))"
"second Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -3, false))"
-"3. Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -2, false))"
-"3rd Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -2, false))"
+"3(?:rd|\.)? Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -2, false))"
"third Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -2, false))"
-"4. Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -1, false))"
-"4th Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -1, false))"
+"4(?:th|\.)? Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -1, false))"
"fourth Advent","12-00 funcDateCalc(WeekdayRelativeTo(YEAR-12-24, 1, -1, false))"
// http://en.wikipedia.org/wiki/Black_Friday_%28shopping%29
"Black Friday","11-00 funcDateCalc(WeekdayRelativeTo(YEAR-11-01, 6, 4, true))"
// http://en.wikipedia.org/wiki/Martin_Luther_King,_Jr._Day
-"Birthday of Martin Luther King, Jr.","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))"
-"Martin Luther King Day","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))"
-"Martin Luther King, Jr. Day","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))"
+"Birthday of Martin Luther King, Jr\.","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))"
+"Martin Luther King(?:, Jr\.)? [Dd]ay","01-00 funcDateCalc(WeekdayRelativeTo(YEAR-01-01, 2, 3, true))"
// http://en.wikipedia.org/wiki/Washington%27s_Birthday
-"Presidents Day","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))"
-"Presidents' Day","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))"
-"President's Day","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))"
+"President'?s'? [Dd]ay","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))"
"Washington's Birthday","02-00 funcDateCalc(WeekdayRelativeTo(YEAR-02-01, 2, 3, true))"
// http://en.wikipedia.org/wiki/Memorial_Day
-"Memorial Day","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-06-01, 2, -1, false))"
-"Decoration Day","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-06-01, 2, -1, false))"
+"Memorial [Dd]ay","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-06-01, 2, -1, false))"
+"Decoration [Dd]ay","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-06-01, 2, -1, false))"
// http://en.wikipedia.org/wiki/Labor_Day
-"Labor Day","09-00 funcDateCalc(WeekdayRelativeTo(YEAR-09-01, 2, 1, true))"
+"Labor [Dd]ay","09-00 funcDateCalc(WeekdayRelativeTo(YEAR-09-01, 2, 1, true))"
// http://en.wikipedia.org/wiki/Columbus_Day
-"Columbus Day","10-00 funcDateCalc(WeekdayRelativeTo(YEAR-10-01, 2, 2, true))"
+"Columbus [Dd]ay","10-00 funcDateCalc(WeekdayRelativeTo(YEAR-10-01, 2, 2, true))"
// http://en.wikipedia.org/wiki/Thanksgiving_%28United_States%29
-"Thanksgiving","11-00 funcDateCalc(WeekdayRelativeTo(YEAR-11-01, 5, 4, true))"
-"Thanksgiving Day","11-00 funcDateCalc(WeekdayRelativeTo(YEAR-11-01, 5, 4, true))"
+"Thanksgiving(?: [Dd]ay)?","11-00 funcDateCalc(WeekdayRelativeTo(YEAR-11-01, 5, 4, true))"
// http://en.wikipedia.org/wiki/Mother%27s_Day
-"Mother's Day","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-05-01, 1, 2, true))"
+"Mother'?s'? [Dd]ay","05-00 funcDateCalc(WeekdayRelativeTo(YEAR-05-01, 1, 2, true))"
diff --git a/resources/english/normalization/resources_normalization_normPartOfYear.txt b/resources/english/normalization/resources_normalization_normPartOfYear.txt
index 1edc8e19..420e3e67 100644
--- a/resources/english/normalization/resources_normalization_normPartOfYear.txt
+++ b/resources/english/normalization/resources_normalization_normPartOfYear.txt
@@ -6,19 +6,13 @@
// For example, the normalized value of "first quarter" is "Q1"
// FORMAT: "parts-of-year-word","normalized-parts-of-year-word"
// First Quarter
-"([Ff]iscal(-| ))?([Ff]irst|1st)(-| )quarter","Q1"
-"([Ff]iscal(-| ))?([Ss]econd|2nd)(-| )quarter","Q2"
-"([Ff]iscal(-| ))?([Tt]hird|3rd)(-| )quarter","Q3"
-"([Ff]iscal(-| ))?([Ff]ou?rth|4th)(-| )quarter","Q4"
-"last-quarter","Q4"
-"Last-quarter","Q4"
-"last quarter","Q4"
-"Last quarter","Q4"
+"([Ff]iscal[ -])?([Ff]irst|1st)[ -]quarter","Q1"
+"([Ff]iscal[ -])?([Ss]econd|2nd)[ -]quarter","Q2"
+"([Ff]iscal[ -])?([Tt]hird|3rd)[ -]quarter","Q3"
+"([Ff]iscal[ -])?([Ff]ou?rth|4th)[ -]quarter","Q4"
+"[Ll]ast[ -]quarter","Q4"
// First Half
-"([Ff]iscal(-| ))?([Ff]irst|1st)(-| )half","H1"
-"([Ff]iscal(-| ))?([Ss]econd|2nd)(-| )half","H2"
+"([Ff]iscal[ -])?([Ff]irst|1st)[ -]half","H1"
// Second Half
-"last-half","H2"
-"Last-half","H2"
-"last half","H2"
-"Last half","H2"
+"([Ff]iscal[ -])?([Ss]econd|2nd)[ -]half","H2"
+"[Ll]ast[ -]half","H2"
diff --git a/resources/english/normalization/resources_normalization_normPartWords.txt b/resources/english/normalization/resources_normalization_normPartWords.txt
index 3bc0b0bd..48c06f26 100644
--- a/resources/english/normalization/resources_normalization_normPartWords.txt
+++ b/resources/english/normalization/resources_normalization_normPartWords.txt
@@ -7,12 +7,20 @@
// FORMAT: "part word","normalized-part-word"
"The middle of","MID"
"the middle of","MID"
+"Middle of","MID"
+"middle of","MID"
"The end of","END"
"the end of","END"
+"End of","END"
+"end of","END"
"The beginning of","START"
"the beginning of","START"
+"Beginning of","START"
+"beginning of","START"
"The start of","START"
"the start of","START"
+"Start of","START"
+"start of","START"
"Late","END"
"late","END"
"Later","END"
@@ -28,4 +36,6 @@
"Fiscal-",""
"fiscal-",""
"Fiscal",""
-"fiscal",""
\ No newline at end of file
+"fiscal",""
+"Dawn of","START"
+"dawn of","START"
diff --git a/resources/english/normalization/resources_normalization_normUnit.txt b/resources/english/normalization/resources_normalization_normUnit.txt
index 306692f0..5fbfd570 100644
--- a/resources/english/normalization/resources_normalization_normUnit.txt
+++ b/resources/english/normalization/resources_normalization_normUnit.txt
@@ -4,28 +4,17 @@
// This file contains "unit words" and their normalized expressions.
// For example, the normalized value of "week" is "week"
// FORMAT: "unit-word","normalized-unit-word"
-"[Dd]ay","day"
-"[Ww]eek-end","week-WE"
-"[Ww]eekend","week-WE"
-"[Ww]eek","week"
-"[Mm]onth","month"
-"[Qq]uarter","quarter"
+"[Dd]ays?","day"
+"[Ww]eek-?ends?","week-WE"
+"[Ww]eeks?","week"
+"[Mm]onths?","month"
+"[Qq]uarters?","quarter"
+"[Yy]ears?","year"
"[Ff]iscal years?","year"
-"[Yy]ear","year"
-"[Dd]ecade","decade"
+"[Dd]ecades?","decade"
"[Cc]entury","century"
-// Plurals (not in reUnit)
-"[Dd]ays","day"
-"[Ww]eek-ends","week-WE"
-"[Ww]eekends","week-WE"
-"[Ww]eeks","week"
-"[Mm]onths","month"
-"[Qq]uarters","quarter"
-"[Yy]ears","year"
-"[Dd]ecades","decade"
-// not in reUnit
"[Cc]enturies","century"
+"[Tt]rading days?","day"
"[Hh]ours?","hour"
"[Mm]inutes?","minute"
-"[Tt]rading days?","day"
-
+"[Ss]econds?","second"
diff --git a/resources/english/normalization/resources_normalization_normWeekday.txt b/resources/english/normalization/resources_normalization_normWeekday.txt
index 47ae295b..d7da1f6f 100644
--- a/resources/english/normalization/resources_normalization_normWeekday.txt
+++ b/resources/english/normalization/resources_normalization_normWeekday.txt
@@ -18,4 +18,4 @@
"Friday","friday"
"Saturday","saturday"
"Sunday","sunday"
-
+"[Ww]e[dn][nd]e?sday","wednesday"
diff --git a/resources/english/normalization/resources_normalization_normYearPrefix.txt b/resources/english/normalization/resources_normalization_normYearPrefix.txt
index 672c8800..f232435e 100644
--- a/resources/english/normalization/resources_normalization_normYearPrefix.txt
+++ b/resources/english/normalization/resources_normalization_normYearPrefix.txt
@@ -4,7 +4,11 @@
// This file contains "BCyears" and their normalized expressions.
// For example, the normalized value of "BC" is "BC"
// FORMAT: "yearPrefix","normalized-yearPrefix"
-"B[\.]?C[\.]?","BC"
-"A[\.]?D[\.]?",""
+"BC","BC"
+"AD",""
+"B\.C\.","BC"
+"A\.D\.",""
"BCE","BC"
-"CE",""
\ No newline at end of file
+"CE",""
+"B\.C\.E\.","BC"
+"C\.E\.",""
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reAndOrTo.txt b/resources/english/repattern/resources_repattern_reAndOrTo.txt
index 73f4ad21..ebac0311 100644
--- a/resources/english/repattern/resources_repattern_reAndOrTo.txt
+++ b/resources/english/repattern/resources_repattern_reAndOrTo.txt
@@ -3,9 +3,5 @@
// date: 2013-10-17
// This file contains regular expression patterns for "and", "or", "to" words.
// FORMAT: one line is one disjunction of the pattern
-[\s]?\–[\s]?
-[\s]?-[\s]?
-[\s]?–[\s]?
- and
- or
- to
\ No newline at end of file
+\s(?:and|or|to)\s
+\s?[/–‒‑-]\s?
diff --git a/resources/english/repattern/resources_repattern_reApproximate.txt b/resources/english/repattern/resources_repattern_reApproximate.txt
index bf98fd31..302d3d31 100644
--- a/resources/english/repattern/resources_repattern_reApproximate.txt
+++ b/resources/english/repattern/resources_repattern_reApproximate.txt
@@ -3,20 +3,11 @@
// date: 2011-06-10
// This file contains regular expression patterns for "approximate" words.
// FORMAT: one line is one disjunction of the pattern
-// about
-[Aa]pproximately
-[Aa]bout
-[Aa]round
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
+[Aa](?:bout|lmost|pproximately|round|t least)
[Cc]irca
-// less
-[Nn]o more than
-[Nn]o longer than
-[Uu]p to
-[Ll]ess than
-[Nn]early
-[Aa]lmost
-// more
-[Aa]t least
+[Ll](?:ess|onger) than
[Mm]ore than
-[Ll]onger than
+[Nn](?:early|o (?:more|longer) than)
[Oo]ver
+[Uu]p to
diff --git a/resources/english/repattern/resources_repattern_reDateWord.txt b/resources/english/repattern/resources_repattern_reDateWord.txt
index f44a1da7..f6a975e1 100644
--- a/resources/english/repattern/resources_repattern_reDateWord.txt
+++ b/resources/english/repattern/resources_repattern_reDateWord.txt
@@ -13,10 +13,8 @@
[Rr]ight now
[Nn]ow
[Aa]s soon as possible
-[Rr]ecently
-[Rr]ecent
-[Cc]urrently
-[Cc]urrent
+[Rr]ecent(?:ly)?
+[Cc]urrent(?:ly)?
// [Ss]oon
// [Pp]reviously
// [Yy]et
diff --git a/resources/english/repattern/resources_repattern_reDayNumber.txt b/resources/english/repattern/resources_repattern_reDayNumber.txt
index f045823a..c5364d53 100644
--- a/resources/english/repattern/resources_repattern_reDayNumber.txt
+++ b/resources/english/repattern/resources_repattern_reDayNumber.txt
@@ -3,6 +3,8 @@
// date: 2011-06-10
// This file contains regular expression patterns for day numbers.
// FORMAT: one line is one disjunction of the pattern
-[12][0-9]
+0[1-9]
+1[0-9]
+2[0-9]
3[01]
-0?[1-9]
\ No newline at end of file
+[1-9]
diff --git a/resources/english/repattern/resources_repattern_reDayNumberTh.txt b/resources/english/repattern/resources_repattern_reDayNumberTh.txt
index 9465d9dc..153f0660 100644
--- a/resources/english/repattern/resources_repattern_reDayNumberTh.txt
+++ b/resources/english/repattern/resources_repattern_reDayNumberTh.txt
@@ -3,16 +3,19 @@
// date: 2011-06-10
// This file contains regular expression patterns for day digit th.
// FORMAT: one line is one disjunction of the pattern
-[123]0th
-[23]?1st
-[2]?2nd
-[2]?3rd
-[12]?4th
-[12]?5th
-[12]?6th
-[12]?7th
-[12]?8th
-[12]?9th
-11th
-12th
-13th
\ No newline at end of file
+1st
+01st
+2nd
+02nd
+3rd
+03rd
+[4-9]th
+0[4-9]th
+1[0-9]th
+20th
+21st
+22nd
+23rd
+2[4-9]th
+30th
+31st
diff --git a/resources/english/repattern/resources_repattern_reDayWordTh.txt b/resources/english/repattern/resources_repattern_reDayWordTh.txt
index 2d5eae0b..cc813c39 100644
--- a/resources/english/repattern/resources_repattern_reDayWordTh.txt
+++ b/resources/english/repattern/resources_repattern_reDayWordTh.txt
@@ -3,6 +3,16 @@
// date: 2011-06-10
// This file contains regular expression patterns for day word th.
// FORMAT: one line is one disjunction of the pattern
+[Ff]irst
+[Ss]econd
+[Tt]hird
+[Ff]ourth
+[Ff]ifth
+[Ss]ixth
+[Ss]eventh
+[Ee]ighth
+[Nn]inth
+// 10
[Tt]enth
[Ee]leventh
[Tt]welfth
@@ -13,24 +23,9 @@
[Ss]eventeenth
[Ee]ighteenth
[Nn]ineteenth
+// 20
[Tt]wentieth
-[Tt]wenty-first
-[Tt]wenty-second
-[Tt]wenty-third
-[Tt]wenty-fourth
-[Tt]wenty-fifth
-[Tt]wenty-sixth
-[Tt]wenty-seventh
-[Tt]wenty-eighth
-[Tt]wenty-ninth
+[Tt]wenty-(?:first|second|third|fourth|fifth|sixth|seventh|eighth|ninth)
+// 30
[Tt]hirtieth
[Tt]hirty-first
-[Ff]irst
-[Ss]econd
-[Tt]hird
-[Ff]ourth
-[Ff]ifth
-[Ss]ixth
-[Ss]eventh
-[Ee]ighth
-[Nn]inth
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reHolidayFix.txt b/resources/english/repattern/resources_repattern_reHolidayFix.txt
index 0c0820a6..3d111591 100644
--- a/resources/english/repattern/resources_repattern_reHolidayFix.txt
+++ b/resources/english/repattern/resources_repattern_reHolidayFix.txt
@@ -5,62 +5,48 @@
// FORMAT: one line is one disjunction of the pattern
// http://en.wikipedia.org/wiki/New_Year%27s_Day
-New Year's Day
+New Year'?s [Dd]ay
// http://en.wikipedia.org/wiki/Groundhog_Day
-Groundhog Day
+Groundhog [Dd]ay
// http://en.wikipedia.org/wiki/Valentine%27s_Day
-Saint Valentines Day
-St\. Valentines Day
-Valentines Day
-St\. Valentine's Day
-Saint Valentine's Day
-Valentine's Day
-Saint Valentines' Day
-St\. Valentines' Day
-Valentines' Day
+(?:Saint |St\. )?Valentine'?s'? [Dd]ay
// http://en.wikipedia.org/wiki/Patriots%27_Day
-Patriots Day
-Patriots' Day
-Patriot's Day
+Patriot'?s'? [Dd]ay
// http://en.wikipedia.org/wiki/German-American_Day
-German-American Day
+German-American [Dd]ay
// http://en.wikipedia.org/wiki/White_Cane_Safety_Day
-White Cane Safety Day
+White Cane Safety [Dd]ay
// http://en.wikipedia.org/wiki/Boss%27s_Day
-Boss's Day
-Bosses Day
-Bosses' Day
+Boss(?:'s|es|es') [Dd]ay
//won't work, because of the "$" in the string
-//The Bo$$ Day
-National Boss Day
+The Bo\$\$ [Dd]ay
+National Boss [Dd]ay
// http://en.wikipedia.org/wiki/Independence_Day_%28US%29
-Independence Day
+Independence [Dd]ay
Fourth of July
// http://en.wikipedia.org/wiki/Veterans_Day
-Veterans Day
-Armistice Day
+Veterans [Dd]ay
+Armistice [Dd]ay
// http://en.wikipedia.org/wiki/Remembrance_Day
-Remembrance Day
-Poppy Day
+Remembrance [Dd]ay
+Poppy [Dd]ay
// http://en.wikipedia.org/wiki/Epiphany_%28holiday%29
Epiphany
Theophany
// http://en.wikipedia.org/wiki/Hallowe%27en
-Halloween
-All Hallows’ Evening
-Hallowe'en
-All Hallows' Eve
+Hallowe'?en
+All Hallows['’] [Ee]ve(?:ning)?
// http://en.wikipedia.org/wiki/Assumption_of_Mary
Assumption of Mary
@@ -68,65 +54,35 @@ Assumption of the Blessed Virgin Mary into Heaven
The Assumption
// http://en.wikipedia.org/wiki/Reformation_Day
-Reformation Day
+Reformation [Dd]ay
// http://en.wikipedia.org/wiki/All_Saints%27_Day
-All Saints' Day
-All Saints
+All Saints'?(?: [Dd]ay)?
Solemnity of All Saints
All Hallows
Hallowmas
// http://en.wikipedia.org/wiki/Christmas
-Christmas Eve
-Christmas Day
-Christmas
-Xmas
-XMAS
+Christmas(?: [Ee]ve| [Dd]ay)?
+X-?(?:mas|MAS)
Noel
Yule
// http://en.wikipedia.org/wiki/New_Year%27s_Eve
-New Year's Eve
+New Year'?s [Ee]ve
Hogmanay
Calennig
// http://en.wikipedia.org/wiki/May_Day
-May Day
-International Workers Day
-International Worker's Day
-International Workers' Day
+May [Dd]ay
+International Worker'?s'? [Dd]ay
// http://en.wikipedia.org/wiki/Boxing_Day
-Boxing Day
+Boxing [Dd]ay
// http://en.wikipedia.org/wiki/St._Patrick%27s_Day
-Saint Patricks Day
-St\. Patricks Day
-St\. Paddys Day
-Saint Paddys Day
-St\. Pattys Day
-Saint Pattys Day
-
-Saint Patrick's Day
-St\. Patrick's Day
-St\. Paddy's Day
-Saint Paddy's Day
-St\. Patty's Day
-Saint Patty's Day
-
-Saint Patricks' Day
-St\. Patricks' Day
-St\. Paddys' Day
-Saint Paddys' Day
-St\. Pattys' Day
-Saint Pattys' Day
+(?:Saint|St\.) Pa(?:trick|ddy|tty)'?s'? [Dd]ay
// http://en.wikipedia.org/wiki/St._Andrew%27s_Day
-Saint Andrews Day
-Saint Andrew's Day
-Saint Andrews' Day
-St\. Andrews Day
-St\. Andrew's Day
-St\. Andrews' Day
+(?:Saint|St\.) Andrew'?s'? [Dd]ay
diff --git a/resources/english/repattern/resources_repattern_reHolidayVar.txt b/resources/english/repattern/resources_repattern_reHolidayVar.txt
index 56b970dd..d0b7af68 100644
--- a/resources/english/repattern/resources_repattern_reHolidayVar.txt
+++ b/resources/english/repattern/resources_repattern_reHolidayVar.txt
@@ -35,23 +35,17 @@ The Great Sabbath
Joyous Saturday
// http://en.wikipedia.org/wiki/Easter
-Easter Sunday
-Easter Day
-Easter
-Resurrection Day
-Resurrection Sunday
+Easter(?: ?[Ss]unday| ?[Dd]ay)?
+Resurrection ?(?:[Ss]unday|[Dd]ay)
// http://en.wikipedia.org/wiki/Easter_Monday
Easter Monday
// http://en.wikipedia.org/wiki/Octave_of_Easter
-Octave of Easter
-Octave Day of Easter
+Octave(?: [Dd]ay)? of Easter
Low Sunday
-Saint Thomas Sunday
-St\. Thomas Sunday
-Quasimodo Sunday
-Quasimodogeniti
+(?:Saint|St\.) Thomas Sunday
+Quasimodo(?: Sunday|geniti)
Second Sunday of Easter
Divine Mercy Sunday
@@ -61,36 +55,30 @@ Solemnity of the Ascension of the Lord
Ascension Thursday
// http://en.wikipedia.org/wiki/Father%27s_Day
-Father's Day
+Father'?s'? [Dd]ay
// http://en.wikipedia.org/wiki/Reformation_Day
Reformation Sunday
// http://en.wikipedia.org/wiki/Pentecost
Pentecost
-Whit Sunday
-Whitsun
-Whit
+Whit(?:sun| ?[Ss]unday)?
// http://en.wikipedia.org/wiki/Corpus_Christi_%28feast%29
Corpus Christi
The Most Holy Body and Blood of Christ
// advent sundays: the four sundays before christmas(12-24)
-1\. Advent
-1st Advent
+1(?:st|\.)? Advent
first Advent
-2\. Advent
-2nd Advent
+2(?:nd|\.)? Advent
second Advent
-3\. Advent
-3rd Advent
+3(?:rd|\.)? Advent
third Advent
-4\. Advent
-4th Advent
+4(?:th|\.)? Advent
fourth Advent
// http://en.wikipedia.org/wiki/Black_Friday_%28shopping%29
@@ -98,28 +86,24 @@ Black Friday
// http://en.wikipedia.org/wiki/Martin_Luther_King,_Jr._Day
Birthday of Martin Luther King, Jr\.
-Martin Luther King Day
-Martin Luther King, Jr\. Day
+Martin Luther King(?:, Jr\.)? [Dd]ay
// http://en.wikipedia.org/wiki/Washington%27s_Birthday
-Presidents Day
-Presidents' Day
-President's Day
+President'?s'? [Dd]ay
Washington's Birthday
// http://en.wikipedia.org/wiki/Memorial_Day
-Memorial Day
-Decoration Day
+Memorial [Dd]ay
+Decoration [Dd]ay
// http://en.wikipedia.org/wiki/Labor_Day
-Labor Day
+Labor [Dd]ay
// http://en.wikipedia.org/wiki/Columbus_Day
-Columbus Day
+Columbus [Dd]ay
// http://en.wikipedia.org/wiki/Thanksgiving_%28United_States%29
-Thanksgiving Day
-Thanksgiving
+Thanksgiving(?: [Dd]ay)?
// http://en.wikipedia.org/wiki/Mother%27s_Day
-Mother's Day
+Mother'?s'? [Dd]ay
diff --git a/resources/english/repattern/resources_repattern_reMonthLong.txt b/resources/english/repattern/resources_repattern_reMonthLong.txt
index 39c76d84..b8e80c1d 100644
--- a/resources/english/repattern/resources_repattern_reMonthLong.txt
+++ b/resources/english/repattern/resources_repattern_reMonthLong.txt
@@ -3,14 +3,11 @@
// date: 2011-06-10
// This file contains regular expression patterns for long months.
// FORMAT: one line is one disjunction of the pattern
-[Jj]anuary
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
+[Jj](?:anuary|une|uly)
[Ff]ebruary
-[Mm]arch
-[Aa]pril
-[Mm]ay
-[Jj]une
-[Jj]uly
-[Aa]ugust
+[Mm](?:arch|ay)
+[Aa](?:pril|ugust)
[Ss]eptember
[Oo]ctober
[Nn]ovember
diff --git a/resources/english/repattern/resources_repattern_reMonthNumber.txt b/resources/english/repattern/resources_repattern_reMonthNumber.txt
index 3435e188..6e419f5d 100644
--- a/resources/english/repattern/resources_repattern_reMonthNumber.txt
+++ b/resources/english/repattern/resources_repattern_reMonthNumber.txt
@@ -3,7 +3,5 @@
// date: 2011-06-10
// This file contains regular expression patterns for month numbers.
// FORMAT: one line is one disjunction of the pattern
-10
-11
-12
+1[0-2]
0?[1-9]
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reMonthShort.txt b/resources/english/repattern/resources_repattern_reMonthShort.txt
index 513bf92a..073ff89f 100644
--- a/resources/english/repattern/resources_repattern_reMonthShort.txt
+++ b/resources/english/repattern/resources_repattern_reMonthShort.txt
@@ -3,53 +3,17 @@
// date: 2011-06-10
// This file contains regular expression patterns for short months.
// FORMAT: one line is one disjunction of the pattern
-[Jj]an\.
-[Jj]an
-[Ff]eb\.
-[Ff]eb
-[Mm]ar\.
-[Mm]ar
-[Aa]pr\.
-[Aa]pr
-[Mm]ay
-[Jj]un\.
-[Jj]un
-[Jj]ul\.
-[Jj]ul
-[Aa]ug\.
-[Aa]ug
-[Ss]ep\.
-[Ss]ep
-[Ss]ept\.
-[Ss]ept
-[Oo]ct\.
-[Oo]ct
-[Nn]ov\.
-[Nn]ov
-[Dd]ec\.
-[Dd]ec
-JAN\.
-JAN
-FEB\.
-FEB
-MAR\.
-MAR
-APR\.
-APR
-MAY
-JUN\.
-JUN
-JUL\.
-JUL
-AUG\.
-AUG
-SEP\.
-SEP
-SEPT\.
-SEPT
-OCT\.
-OCT
-NOV\.
-NOV
-DEC\.
-DEC
\ No newline at end of file
+// 2016-05-09: note that \.? causes problems with the pattern optimizer. Use (?:\.|) instead. -- Erich
+(?:jan|Jan|JAN)(?:\.|)
+(?:feb|Feb|FEB)(?:\.|)
+(?:mar|Mar|MAR)(?:\.|)
+(?:apr|Apr|APR)(?:\.|)
+// May (only three letters, no dot)
+(?:may|May|MAY)
+(?:jun|Jun|JUN)(?:\.|)
+(?:jul|Jul|JUL)(?:\.|)
+(?:aug|Aug|AUG)(?:\.|)
+(?:sept|sep|Sept|Sep|SEPT|SEP)(?:\.|)
+(?:oct|Oct|OCT)(?:\.|)
+(?:nov|Nov|NOV)(?:\.|)
+(?:dec|Dec|DEC)(?:\.|)
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reNumWord2D.txt b/resources/english/repattern/resources_repattern_reNumWord2D.txt
index 75bfc7d6..f6d07582 100644
--- a/resources/english/repattern/resources_repattern_reNumWord2D.txt
+++ b/resources/english/repattern/resources_repattern_reNumWord2D.txt
@@ -14,83 +14,20 @@
[Ss]eventeen
[Ee]ighteen
[Nn]ineteen
-[Tt]wenty[ -]one
-[Tt]hirty[ -]one
-[Ff]orty[ -]one
-[Ff]ifty[ -]one
-[Ss]ixty[ -]one
-[Ss]eventy[ -]one
-[Ee]ighty[ -]one
-[Nn]inety[ -]one
-[Tt]wenty[ -]two
-[Tt]hirty[ -]two
-[Ff]orty[ -]two
-[Ff]ifty[ -]two
-[Ss]ixty[ -]two
-[Ss]eventy[ -]two
-[Ee]ighty[ -]two
-[Nn]inety[ -]two
-[Tt]wenty[ -]three
-[Tt]hirty[ -]three
-[Ff]orty[ -]three
-[Ff]ifty[ -]three
-[Ss]ixty[ -]three
-[Ss]eventy[ -]three
-[Ee]ighty[ -]three
-[Nn]inety[ -]three
-[Tt]wenty[ -]four
-[Tt]hirty[ -]four
-[Ff]orty[ -]four
-[Ff]ifty[ -]four
-[Ss]ixty[ -]four
-[Ss]eventy[ -]four
-[Ee]ighty[ -]four
-[Nn]inety[ -]four
-[Tt]wenty[ -]five
-[Tt]hirty[ -]five
-[Ff]orty[ -]five
-[Ff]ifty[ -]five
-[Ss]ixty[ -]five
-[Ss]eventy[ -]five
-[Ee]ighty[ -]five
-[Nn]inety[ -]five
-[Tt]wenty[ -]six
-[Tt]hirty[ -]six
-[Ff]orty[ -]six
-[Ff]ifty[ -]six
-[Ss]ixty[ -]six
-[Ss]eventy[ -]six
-[Ee]ighty[ -]six
-[Nn]inety[ -]six
-[Tt]wenty[ -]seven
-[Tt]hirty[ -]seven
-[Ff]orty[ -]seven
-[Ff]ifty[ -]seven
-[Ss]ixty[ -]seven
-[Ss]eventy[ -]seven
-[Ee]ighty[ -]seven
-[Nn]inety[ -]seven
-[Tt]wenty[ -]eight
-[Tt]hirty[ -]eight
-[Ff]orty[ -]eight
-[Ff]ifty[ -]eight
-[Ss]ixty[ -]eight
-[Ss]eventy[ -]eight
-[Ee]ighty[ -]eight
-[Nn]inety[ -]eight
-[Tt]wenty[ -]nine
-[Tt]hirty[ -]nine
-[Ff]orty[ -]nine
-[Ff]ifty[ -]nine
-[Ss]ixty[ -]nine
-[Ss]eventy[ -]nine
-[Ee]ighty[ -]nine
-[Nn]inety[ -]nine
+//
[Tt]wenty
+[Tt]wenty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
[Tt]hirty
+[Tt]hirty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
[Ff]orty
+[Ff]orty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
[Ff]ifty
+[Ff]ifty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
[Ss]ixty
+[Ss]ixty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
[Ss]eventy
+[Ss]eventy[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
[Ee]ighty
+[Ee]ighty[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
[Nn]inety
+[Nn]inety[ -]?(?:one|two|three|four|five|six|seven|eight|nine)
diff --git a/resources/english/repattern/resources_repattern_reNumWordTeen.txt b/resources/english/repattern/resources_repattern_reNumWordTeen.txt
index 1706b070..e7e1bb49 100644
--- a/resources/english/repattern/resources_repattern_reNumWordTeen.txt
+++ b/resources/english/repattern/resources_repattern_reNumWordTeen.txt
@@ -3,14 +3,14 @@
// date: 2011-06-10
// This file contains regular expression patterns for number words 10 to 20.
// FORMAT: one line is one disjunction of the pattern
-ten
-eleven
-twelve
-thirteen
-fourteen
-fifteen
-sixteen
-seventeen
-eighteen
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
+// 10, 12, 13, 20
+t(?:en|welve|hirteen|wenty)
+// 11, 18
+e(?:leven|ighteen)
+// 14, 15
+f(?:ourteen|ifteen)
+// 16, 17
+s(?:ixteen|eventeen)
+// 19
nineteen
-twenty
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_rePartOfDay.txt b/resources/english/repattern/resources_repattern_rePartOfDay.txt
index f65c42ea..ad4ac036 100644
--- a/resources/english/repattern/resources_repattern_rePartOfDay.txt
+++ b/resources/english/repattern/resources_repattern_rePartOfDay.txt
@@ -3,12 +3,9 @@
// date: 2011-06-10
// This file contains regular expression patterns for parts of days.
// FORMAT: one line is one disjunction of the pattern
-[Mm]id-afternoon
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
+[Mm](?:orning|id-afternoon|idnight|id-day)
+[Nn](?:oon|ight)
[Aa]fternoon
-[Nn]oon
-[Mm]idnight
-[Mm]id-day
-[Nn]ight
[Tt]onight
-[Mm]orning
[Ee]vening
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_rePartOfYear.txt b/resources/english/repattern/resources_repattern_rePartOfYear.txt
index ee2ff593..1ee98b63 100644
--- a/resources/english/repattern/resources_repattern_rePartOfYear.txt
+++ b/resources/english/repattern/resources_repattern_rePartOfYear.txt
@@ -3,61 +3,13 @@
// date: 2011-06-10
// This file contains regular expression patterns for parts of year.
// FORMAT: one line is one disjunction of the pattern
-[Ff]iscal-first quarter
-[Ff]iscal-first half
-[Ff]iscal-first-quarter
-[Ff]iscal-first-half
-[Ff]iscal-second quarter
-[Ff]iscal-second half
-[Ff]iscal-second-quarter
-[Ff]iscal-second-half
-[Ff]iscal-third quarter
-[Ff]iscal-third half
-[Ff]iscal-third-quarter
-[Ff]iscal-third-half
-[Ff]iscal-fourth quarter
-[Ff]iscal-fourth half
-[Ff]iscal-fourth-quarter
-[Ff]iscal-fourth-half
-[Ff]iscal-forth quarter
-[Ff]iscal-forth half
-[Ff]iscal-forth-quarter
-[Ff]iscal-forth-half
-[Ff]iscal first quarter
-[Ff]iscal first half
-[Ff]iscal first-quarter
-[Ff]iscal first-half
-[Ff]iscal second quarter
-[Ff]iscal second half
-[Ff]iscal second-quarter
-[Ff]iscal second-half
-[Ff]iscal third quarter
-[Ff]iscal third-quarter
-[Ff]iscal fourth quarter
-[Ff]iscal fourth-quarter
-[Ff]iscal forth quarter
-[Ff]iscal forth-quarter
-[Ff]irst quarter
-[Ff]irst half
-[Ff]irst-quarter
-[Ff]irst-half
-[Ss]econd quarter
-[Ss]econd half
-[Ss]econd-quarter
-[Ss]econd-half
-[Tt]hird quarter
-[Tt]hird-quarter
-[Ff]ourth quarter
-[Ff]ourth-quarter
-[Ff]orth quarter
-[Ff]orth-quarter
-[Ll]ast quarter
-[Ll]ast half
-[Ll]ast-quarter
-[Ll]ast-half
-1st quarter
-2nd quarter
-3rd quarter
-4th quarter
-1st half
-2nd half
\ No newline at end of file
+[Ff]iscal[ -](?:first|second|third|fou?rth)[ -](?:half|quarter)
+[Ff]irst[ -](?:half|quarter)
+[Ss]econd[ -](?:half|quarter)
+[Tt]hird[ -]quarter
+[Ff]ou?rth[ -]quarter
+[Ll]ast[ -](?:half|quarter)
+1st[ -](?:half|quarter)
+2nd[ -](?:half|quarter)
+3rd[ -]quarter
+4th[ -]quarter
diff --git a/resources/english/repattern/resources_repattern_rePartWords.txt b/resources/english/repattern/resources_repattern_rePartWords.txt
index 29253adf..bc34d850 100644
--- a/resources/english/repattern/resources_repattern_rePartWords.txt
+++ b/resources/english/repattern/resources_repattern_rePartWords.txt
@@ -3,15 +3,12 @@
// date: 2011-06-10
// This file contains regular expression patterns for "part" words.
// FORMAT: one line is one disjunction of the pattern
-[Tt]he middle of
-[Tt]he end of
-[Tt]he beginning of
-[Tt]he start of
-[Ll]ate
-[Ll]ater
-[Ee]arly
-[Ee]arlier
-[Mm]id-
-[Mm]id
-[Ff]iscal-
-[Ff]iscal
\ No newline at end of file
+[Tt]he (?:start|beginning|middle|end) of
+[Ss]tart of
+[Bb]eginning of
+[Mm]iddle of
+[Ee]nd of
+[Ll]ater?
+[Ee]arl(?:y|ier)
+[Mm]id-?
+[Ff]iscal-?
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reSeason.txt b/resources/english/repattern/resources_repattern_reSeason.txt
index 5f079816..d65181f1 100644
--- a/resources/english/repattern/resources_repattern_reSeason.txt
+++ b/resources/english/repattern/resources_repattern_reSeason.txt
@@ -3,8 +3,7 @@
// date: 2011-06-10
// This file contains regular expression patterns for seasons.
// FORMAT: one line is one disjunction of the pattern
-[Ss]pring
-[Ss]ummer
+[Ss](?:pring|ummer)
[Ff]all
[Aa]utumn
[Ww]inter
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reTimeHour.txt b/resources/english/repattern/resources_repattern_reTimeHour.txt
index f9795cef..d6c1d19c 100644
--- a/resources/english/repattern/resources_repattern_reTimeHour.txt
+++ b/resources/english/repattern/resources_repattern_reTimeHour.txt
@@ -3,19 +3,6 @@
// date: 2011-06-10
// This file contains regular expression patterns for time hours.
// FORMAT: one line is one disjunction of the pattern
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20
-21
-22
-23
-24
+1[0-9]
+2[0-4]
0?[0-9]
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reTimeMinute.txt b/resources/english/repattern/resources_repattern_reTimeMinute.txt
index 6e560f7a..40572855 100644
--- a/resources/english/repattern/resources_repattern_reTimeMinute.txt
+++ b/resources/english/repattern/resources_repattern_reTimeMinute.txt
@@ -3,4 +3,4 @@
// date: 2011-06-10
// This file contains regular expression patterns for time minutes.
// FORMAT: one line is one disjunction of the pattern
-[0|1|2|3|4|5][0-9]
\ No newline at end of file
+[0-5][0-9]
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reTimezone.txt b/resources/english/repattern/resources_repattern_reTimezone.txt
index 6864e1a7..201eb1a6 100644
--- a/resources/english/repattern/resources_repattern_reTimezone.txt
+++ b/resources/english/repattern/resources_repattern_reTimezone.txt
@@ -3,6 +3,12 @@
// date: 2011-06-10
// This file contains regular expression patterns for time zones.
// FORMAT: one line is one disjunction of the pattern
-EST
-EDT
+// 2016-01-13 Added more (important) time zones -- Erich
GMT
+UTC
+// Important US time zones (Omitted Alaska)
+[EPCMH][SD]T
+// Central Europe
+CE[SD]?T(?: DST)?
+// Japan
+JST
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reUnit.txt b/resources/english/repattern/resources_repattern_reUnit.txt
index d9049720..1df562c9 100644
--- a/resources/english/repattern/resources_repattern_reUnit.txt
+++ b/resources/english/repattern/resources_repattern_reUnit.txt
@@ -3,16 +3,13 @@
// date: 2011-06-10
// This file contains regular expression patterns for unit words.
// FORMAT: one line is one disjunction of the pattern
-[Tt]rading days?
[Dd]ays?
-[Ww]eek-ends?
-[Ww]eekends?
-[Ww]eeks?
+[Ww]eek(?:-?end)?s?
[Mm]onths?
[Qq]uarters?
[Ff]iscal years?
[Yy]ears?
[Dd]ecades?
-[Cc]entury
-[Cc]enturies
+[Cc]entur(?:y|ies)
+[Tt]rading days?
diff --git a/resources/english/repattern/resources_repattern_reUnitFine.txt b/resources/english/repattern/resources_repattern_reUnitFine.txt
new file mode 100644
index 00000000..5b843762
--- /dev/null
+++ b/resources/english/repattern/resources_repattern_reUnitFine.txt
@@ -0,0 +1,17 @@
+// author: Jannik Strötgen
+// email: stroetgen@uni-hd.de
+// date: 2011-06-10
+// This file contains regular expression patterns for unit words.
+// FORMAT: one line is one disjunction of the pattern
+[Hh]ours?
+[Mm]inutes?
+[Ss]econds?
+[Dd]ays?
+[Ww]eek(?:-?end)?s?
+[Mm]onths?
+[Qq]uarters?
+[Ff]iscal years?
+[Yy]ears?
+[Dd]ecades?
+[Cc]entur(?:y|ies)
+[Tt]rading days?
diff --git a/resources/english/repattern/resources_repattern_reWeekday.txt b/resources/english/repattern/resources_repattern_reWeekday.txt
index 5218f98d..069cba47 100644
--- a/resources/english/repattern/resources_repattern_reWeekday.txt
+++ b/resources/english/repattern/resources_repattern_reWeekday.txt
@@ -3,10 +3,12 @@
// date: 2011-06-10
// This file contains regular expression patterns for weekdays.
// FORMAT: one line is one disjunction of the pattern
+// 2016-01-10 Allow some spelling mistakes of Wednesday -- Erich
+// 2016-01-13 Folded patterns by first letter for performance -- Erich
[Mm]onday
[Tt]uesday
-[Ww]ednesday
+// Allow spelling errors:
+[Ww]e[dn][dn]e?sday
[Tt]hursday
[Ff]riday
-[Ss]aturday
-[Ss]unday
\ No newline at end of file
+[Ss](?:aturday|unday)
diff --git a/resources/english/repattern/resources_repattern_reYear2Digit.txt b/resources/english/repattern/resources_repattern_reYear2Digit.txt
index d0ed65e6..4bb18e18 100644
--- a/resources/english/repattern/resources_repattern_reYear2Digit.txt
+++ b/resources/english/repattern/resources_repattern_reYear2Digit.txt
@@ -3,4 +3,4 @@
// date: 2011-06-10
// This file contains regular expression patterns for year numbers (2 digit).
// FORMAT: one line is one disjunction of the pattern
-\d\d
\ No newline at end of file
+[0-9][0-9]
\ No newline at end of file
diff --git a/resources/english/repattern/resources_repattern_reYear4Digit.txt b/resources/english/repattern/resources_repattern_reYear4Digit.txt
index 99ae2548..30615d13 100644
--- a/resources/english/repattern/resources_repattern_reYear4Digit.txt
+++ b/resources/english/repattern/resources_repattern_reYear4Digit.txt
@@ -3,4 +3,8 @@
// date: 2011-06-10
// This file contains regular expression patterns for year numbers (4 digits).
// FORMAT: one line is one disjunction of the pattern
-[12]\d\d\d
\ No newline at end of file
+1[0-9][0-9][0-9]
+// Be conservative with future dates beyond 2100:
+20[0-9][0-9]
+2100
+2200
diff --git a/resources/english/repattern/resources_repattern_reYearBC.txt b/resources/english/repattern/resources_repattern_reYearBC.txt
index 68a4665c..cdfd1da9 100644
--- a/resources/english/repattern/resources_repattern_reYearBC.txt
+++ b/resources/english/repattern/resources_repattern_reYearBC.txt
@@ -3,7 +3,4 @@
// date: 2011-06-10
// This file contains regular expression patterns for year numbers (4 digits).
// FORMAT: one line is one disjunction of the pattern
-\d\d\d\d
-\d\d\d
-\d\d
-\d
+[0-9][0-9]?[0-9]?[0-9]?
diff --git a/resources/english/repattern/resources_repattern_reYearPrefix.txt b/resources/english/repattern/resources_repattern_reYearPrefix.txt
index 1964fee0..89ff7145 100644
--- a/resources/english/repattern/resources_repattern_reYearPrefix.txt
+++ b/resources/english/repattern/resources_repattern_reYearPrefix.txt
@@ -3,11 +3,11 @@
// date: 2011-06-10
// This file contains regular expression patterns for year numbers (4 digits).
// FORMAT: one line is one disjunction of the pattern
-BC
-B[\.]C[\.]
-B[\.]C
+A\.D\.
AD
-A[\.]D[\.]
-A[\.]D
+B\.C\.
+BC
+B\.C\.E\.
BCE
+C\.E\.
CE
\ No newline at end of file
diff --git a/resources/english/rules/resources_rules_daterules.txt b/resources/english/rules/resources_rules_daterules.txt
index f8b79722..a8dfc7bf 100644
--- a/resources/english/rules/resources_rules_daterules.txt
+++ b/resources/english/rules/resources_rules_daterules.txt
@@ -4,93 +4,6 @@
// This file contains rules for the temporal expressions of the type date: daterules
// RULENAME="",EXTRACTION="",NORM_VALUE=""(,OFFSET="")?(,POS_CONSTRAINT="")?(,NORM_MOD="")?(,NORM_QUANT="")?(,NORM_FREQ="")?
-// Note: rule with "-BCADhint" in the rule name contain explicit BC or AD information.
-// This information is important during the normalization process.
-
-///////////////////
-// History RULES //
-///////////////////
-
-// historic dates; year granularity; with explicit AD / BC hints
-// EXAMPLE historic_1a-BCADhint: 190 BC (1- to 4-digit year)
-// EXAMPLE historic_1b-BCADhint: BC 190 (1- to 4-digit year)
-// EXAMPLE historic_1c-BCADhint: 190 or 180 BC (find "190 BC"; 1- to 4-digit year)
-RULENAME="date_historic_1a-BCADhint",EXTRACTION="(%reApproximate )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_historic_1b-BCADhint",EXTRACTION="(%reApproximate )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(3))%normYearBC(group(4))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="x_date_historic_1c-BCADhint",EXTRACTION="(%reApproximate )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(6))%normYearBC(group(3))",NORM_MOD="%normApprox4Dates(group(2))",OFFSET="group(0)-group(3)"
-
-// historic dates; month granularity
-// EXAMPLE historic_2a-BCADhint: March 190 BC (1- to 4-digit year)
-// EXAMPLE historic_2b: March 190 (3-digit year)
-// EXAMPLE historic_2c: (in) March 90 (2-digit year)
-// EXAMPLE historic_2d: March of 90 (2-digit year)
-RULENAME="date_historic_2a-BCADhint",EXTRACTION="(%reApproximate )?(%reMonthLong|%reMonthShort)( of | )%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(8))%normYearBC(group(7))-%normMonth(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_historic_2b",EXTRACTION="(%reApproximate )?(%reMonthLong|%reMonthShort)( of | )([\d][\d][\d])",NORM_VALUE="%normYearBC(group(7))-%normMonth(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_historic_2c",EXTRACTION="([Ii]n )(%reMonthLong|%reMonthShort)( of | )%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(6)-%normMonth(group(2))",OFFSET="group(2)-group(6)"
-RULENAME="date_historic_2d",EXTRACTION="(%reMonthLong|%reMonthShort)( of )%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(5)-%normMonth(group(1))"
-
-// historic dates; day granularity
-// EXAMPLE historic_3a-BCADhint: March 29, 190 BC (1- to 4-digit year)
-// EXAMPLE historic_3b-BCADhint: 29 March 190 BC (1- to 4-digit year)
-// EXAMPLE historic_3c-BCADhint: 29th of March 190 BC (1- to 4-digit year)
-// EXAMPLE historic_3d-BCADhint: March 29, 190 (3-digit year)
-// EXAMPLE historic_3e-BCADhint: March 29, 90 (2-digit year)
-RULENAME="date_historic_3a-BCADhint",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)[\s]?,? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(8))%normYearBC(group(7))-%normMonth(group(1))-%normDay(group(4))"
-RULENAME="date_historic_3b-BCADhint",EXTRACTION="%reDayNumber (%reMonthLong|%reMonthShort)([\s]?,)? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))-%normMonth(group(2))-%normDay(group(1))"
-RULENAME="date_historic_3c-BCADhint",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber) (of) (%reMonthLong|%reMonthShort) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(10))%normYearBC(group(9))-%normMonth(group(6))-%normDay(group(1))"
-RULENAME="date_historic_3d",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)[\s]?,? ([\d\d\d])",NORM_VALUE="%normYearBC(group(7))-%normMonth(group(1))-%normDay(group(4))"
-RULENAME="date_historic_3e",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)(,)? ([\d\d])",NORM_VALUE="UNDEF-centurygroup(8)-%normMonth(group(1))-%normDay(group(4))"
-
-// historic dates; season granularity
-// EXAMPLE historic_4a-BCADhint: summer of 190 BC (1- to 4-digit year)
-RULENAME="date_historic_4a-BCADhint",EXTRACTION="(%reApproximate )?(the )?%reSeason( of | )%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))-%normSeason(group(4))",NORM_MOD="%normApprox4Dates(group(2))"
-
-// historic dates; century granularity
-// EXAMPLE date_historic_5a-BCADhint: the 2nd century BC
-// EXAMPLE date_historic_5b-BCADhint: beginning of the 2nd century BC
-// EXAMPLE date_historic_5c-BCADhint: 2nd or 3rd century BC (find "2nd century BC")
-// EXAMPLE date_historic_5d-BCADhint: beginning of the 2nd or 3rd century BC (find "beginning 2nd century BC")
-RULENAME="date_historic_5a-BCADhint",EXTRACTION="([Tt]he )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)( %reYearPrefix)",NORM_VALUE="%normYearPrefix(group(7))%normDay(%SUM%(%normDay(group(2)),-1))"
-RULENAME="date_historic_5b-BCADhint",EXTRACTION="%rePartWords( the)? (%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)( %reYearPrefix)",NORM_VALUE="%normYearPrefix(group(8))%normDay(%SUM%(%normDay(group(3)),-1))",NORM_MOD="%normPartWords(group(1))"
-RULENAME="date_historic_5c-BCADhint",EXTRACTION="(([Tt]he )?(%reDayNumberTh|%reDayWordTh))%reAndOrTo(the )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)( %reYearPrefix)",NORM_VALUE="%normYearPrefix(group(13))%normDay(%SUM%(%normDay(group(3)),-1))",OFFSET="group(1)-group(1)"
-RULENAME="date_historic_5d-BCADhint",EXTRACTION="(%rePartWords( the)? (%reDayNumberTh|%reDayWordTh))%reAndOrTo(the )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)( %reYearPrefix)",NORM_VALUE="%normYearPrefix(group(14))%normDay(%SUM%(%normDay(group(4)),-1))",OFFSET="group(1)-group(1)",NORM_MOD="%normPartWords(group(2))"
-
-// historic dates; decade granularity
-// EXAMPLE date_historic_6a-BCADhint: 1990s BC
-// EXAMPLE date_historic_6b-BCADhint: 190s BC
-// EXAMPLE date_historic_6c-BCADhint: 90s BC
-RULENAME="date_historic_6a-BCADhint",EXTRACTION="(%rePartWords )?([Tt]he )?(\d\d\d0)[']?[s] %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%SUBSTRING%(group(4),0,3)",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_historic_6b-BCADhint",EXTRACTION="(%rePartWords )?([Tt]he )?(\d\d0)[']?[s] %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))0%SUBSTRING%(group(4),0,2)",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_historic_6c-BCADhint",EXTRACTION="(%rePartWords )?([Tt]he )?(\d0)[']?[s] %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))00%SUBSTRING%(group(4),0,1)",NORM_MOD="%normPartWords(group(2))"
-
-// historic dates; year granularity (no AD / BC hints)
-// EXAMPLE date_historic_7a: (in) 190 (3-digit year)
-// EXAMPLE date_historic_7b: (in) 190(,) (3-digit year)
-// EXAMPLE date_historic_7c: (newline)190(newline) (2- to 4-digit year)
-// EXAMPLE date_historic_7d: year of 90 (2-digit year)
-// EXAMPLE date_historic_7e: year of 190 (3-digit year)
-RULENAME="date_historic_7a",EXTRACTION="(\b[Ii]n) (\d\d\d)",NORM_VALUE="%normYearBC(group(2))",OFFSET="group(2)-group(2)"
-RULENAME="date_historic_7b",EXTRACTION="(\b[Ii]n) (\d\d\d)(,)",NORM_VALUE="%normYearBC(group(2))",OFFSET="group(2)-group(2)"
-RULENAME="date_historic_7c",EXTRACTION="\A(\d\d[\d]?[\d]?)\Z",NORM_VALUE="%normYearBC(group(1))"
-RULENAME="date_historic_7d",EXTRACTION="([Tt]he )?(year) (of) (\d\d)",NORM_VALUE="UNDEF-centurygroup(4)"
-RULENAME="date_historic_7e",EXTRACTION="([Tt]he )?(year) (of) (\d\d\d)",NORM_VALUE="%normYearBC(group(4))"
-
-// historic dates; 2-digit year granularity (no AD / BC hints)
-// EXAMPLE date_historic_8a: (in) 90(,) (2-digit year)
-// EXAMPLE date_historic_8b: (in) 90 (2-digit year)
-RULENAME="date_historic_8a",EXTRACTION="(\b[Ii]n) (\d\d)(,)",NORM_VALUE="UNDEF-centurygroup(2)",OFFSET="group(2)-group(2)"
-RULENAME="date_historic_8b",EXTRACTION="(\b[Ii]n) (\d\d)",NORM_VALUE="UNDEF-centurygroup(2)",OFFSET="group(2)-group(2)"
-
-// historic dates; negative rules
-// EXAMPLE date_historic_0a: in 90 cases (2- to 4-digit year)
-// EXAMPLE date_historic_0b: in 90 nice cases (2- to 4-digit year)
-// EXAMPLE date_historic_0c: in 90 nice law cases (2- to 4-digit year)
-// EXAMPLE date_historic_0d: in 90 percent (2- to 4-digit year)
-RULENAME="date_historic_0a_negative",EXTRACTION="(\b[Ii]n) (%reYearBC )([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(4):NNS:",OFFSET="group(2)-group(2)"
-RULENAME="date_historic_0b_negative",EXTRACTION="(\b[Ii]n) (%reYearBC )([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(4):JJ:group(5):NNS:",OFFSET="group(2)-group(2)"
-RULENAME="date_historic_0c_negative",EXTRACTION="(\b[Ii]n) (%reYearBC )([\S]+) ([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(4):JJ:group(5):NN:group(6):NNS:",OFFSET="group(2)-group(2)"
-RULENAME="date_historic_0d_negative",EXTRACTION="(\b[Ii]n) (%reYearBC )(kilometers?|miles?|foot|feet|dollars?|percents?|millions?|mi|ft|km|%|\$)",NORM_VALUE="REMOVE"
-
////////////////////
// POSITIVE RULES //
////////////////////
@@ -122,9 +35,9 @@ RULENAME="date_r0h",EXTRACTION="%reDayNumber[\.]%reMonthNumber[\.]%reYear4Digit"
// EXAMPLE r1a_3: Feb. 25, 2009, Monday
// EXAMPLE r1b_1: 25 February 2009
// EXAMPLE r1c_1: 25 of February 2009
-RULENAME="date_r1a",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)[\s]?,? %reYear4Digit(, %reWeekday)?",NORM_VALUE="group(7)-%normMonth(group(1))-%normDay(group(4))"
-RULENAME="date_r1b",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNumber) (%reMonthLong|%reMonthShort)([\s]?,)? %reYear4Digit",NORM_VALUE="group(10)-%normMonth(group(6))-%normDay(group(2))"
-RULENAME="date_r1c",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNumber) (of) (%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(10)-%normMonth(group(7))-%normDay(group(2))"
+RULENAME="date_r1a",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)\s?,? %reYear4Digit(?:, %reWeekday)?",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))"
+RULENAME="date_r1b",EXTRACTION="(?:[Tt]he )?%(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort)\s?,? %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_r1c",EXTRACTION="(?:[Tt]he )?%(reDayNumber|reDayNumberTh|reDayWordTh) of %(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))"
RULENAME="date_r1d",EXTRACTION="%reDayNumber[‐–-]%reMonthShort[‐–-]%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))-%normDay(group(1))"
// date_r2
@@ -138,15 +51,15 @@ RULENAME="date_r1d",EXTRACTION="%reDayNumber[‐–-]%reMonthShort[‐–-]%reYe
// EXAMPLE r2c_3: 19th of November
// EXAMPLE r2d_1: 3 to 6 May (find May 3)
// EXAMPLE r2e_2: 3 to 6 May 2004 (find May 3, 2004)
-RULENAME="date_r2a",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(4))"
-RULENAME="date_r2b",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)%reAndOrTo(%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(9))",OFFSET="group(9)-group(9)"
-RULENAME="date_r2c",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNumber)( of | )(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(7))-%normDay(group(2))"
-RULENAME="date_r2d",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber[\.]?)%reAndOrTo(%reDayWordTh|%reDayNumberTh|%reDayNumber[.]?) (%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(10))-%normDay(group(1))",OFFSET="group(1)-group(1)"
-RULENAME="date_r2e",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber[\.]?)%reAndOrTo(%reDayWordTh|%reDayNumberTh|%reDayNumber[.]?) (%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(13)-%normMonth(group(10))-%normDay(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="date_r2a",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(2))"
+RULENAME="date_r2b",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(4))",OFFSET="group(4)-group(4)"
+RULENAME="date_r2c",EXTRACTION="(?:[Tt]he )?%(reDayNumber|reDayNumberTh|reDayWordTh) (?:of )?%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_r2d",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh) %reMonthShort",NORM_VALUE="UNDEF-year-%normMonth(group(4))-%normDay(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="date_r2e",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh) %reMonthShort %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(4))-%normDay(group(1))",OFFSET="group(1)-group(1)"
// EXAMPLE r2a2_1: January 19th of that year
-// EXAMPLE r2b2_1: 19th of January of the same year
-RULENAME="date_r2a2",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber) of (that|the same) year",NORM_VALUE="UNDEF-REF-year-PLUS-0-%normMonth(group(1))-%normDay(group(4))"
-RULENAME="date_r2c2",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNumber)( of | )(%reMonthLong|%reMonthShort) of (that|the same) year",NORM_VALUE="UNDEF-REF-year-PLUS-0-%normMonth(group(7))-%normDay(group(2))"
+// EXAMPLE r2c2_1: 19th of January of the same year
+RULENAME="date_r2a2",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh) of (?:that|the same) year",NORM_VALUE="UNDEF-REF-year-PLUS-0-%normMonth(group(1))-%normDay(group(2))"
+RULENAME="date_r2c2",EXTRACTION="(?:[Tt]he )?%(reDayNumber|reDayNumberTh|reDayWordTh) (?:of )?%(reMonthLong|reMonthShort) of (?:that|the same) year",NORM_VALUE="UNDEF-REF-year-PLUS-0-%normMonth(group(2))-%normDay(group(1))"
// date_r3
@@ -154,14 +67,14 @@ RULENAME="date_r2c2",EXTRACTION="([Tt]he )?(%reDayWordTh|%reDayNumberTh|%reDayNu
// EXAMPLE r3a_2: Monday, Oct 12
// EXAMPLE r3b_1: Friday October 13 2009
// EXAMPLE r3b_2: Monday, October 12th 2009
-RULENAME="date_r3a",EXTRACTION="%reWeekday[,]? (%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(5))"
-RULENAME="date_r3b",EXTRACTION="%reWeekday[,]? (%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)[,]? %reYear4Digit",NORM_VALUE="group(9)-%normMonth(group(2))-%normDay(group(5))"
+RULENAME="date_r3a",EXTRACTION="%reWeekday,? %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(3))"
+RULENAME="date_r3b",EXTRACTION="%reWeekday,? %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh),? %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(2))-%normDay(group(3))"
// date_r4
// EXAMPLE r4a_1: September 14 and 18, 2010 (find September 14 2010)
// EXAMPLE r4b_1: September 14 and 18, 2010 (find September 18 2010)
-RULENAME="date_r4a",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)%reAndOrTo(%reDayNumberTh|%reDayNumber)[,]? %reYear4Digit",NORM_VALUE="group(11)-%normMonth(group(1))-%normDay(group(4))",OFFSET="group(0)-group(4)"
-RULENAME="date_r4b",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)%reAndOrTo(%reDayNumberTh|%reDayNumber)[,]? %reYear4Digit",NORM_VALUE="group(11)-%normMonth(group(1))-%normDay(group(8))",OFFSET="group(8)-group(11)"
+RULENAME="date_r4a",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)%reAndOrTo%(reDayNumber|reDayNumberTh),? %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(1))-%normDay(group(2))",OFFSET="group(0)-group(2)"
+RULENAME="date_r4b",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)%reAndOrTo%(reDayNumber|reDayNumberTh),? %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(1))-%normDay(group(4))",OFFSET="group(4)-group(5)"
// date_r5
// EXAMPLE r5a_1: tomorrow
@@ -178,7 +91,7 @@ RULENAME="date_r5d",EXTRACTION="%rePartWords %reWeekday",NORM_VALUE="UNDEF-day-%
//////////////////////
// date_r6
// EXAMPLE r6a_1: the weekend
-RULENAME="date_r61",EXTRACTION="(the|that) weekend",NORM_VALUE="UNDEF-last-week-WE"
+RULENAME="date_r61",EXTRACTION="(?:the|that) weekend",NORM_VALUE="UNDEF-last-week-WE"
///////////////////////
// MONTH GRANULARITY //
@@ -188,16 +101,17 @@ RULENAME="date_r61",EXTRACTION="(the|that) weekend",NORM_VALUE="UNDEF-last-week-
// EXAMPLE r7a_2: Nov. 2001
// EXAMPLE r7a_3: February of 1999
// EXAMPLE r7b_1: May and June 2011 (find May 2011)
-RULENAME="date_r7a",EXTRACTION="(%reMonthLong|%reMonthShort)( of | )%reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(1))"
-RULENAME="date_r7b",EXTRACTION="(%reMonthLong|%reMonthShort)( of | )%reNumWordTeen( |-)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(5))%normDurationNumber(group(7))-%normMonth(group(1))"
-RULENAME="date_r7c",EXTRACTION="(%reMonthLong|%reMonthShort) (and|or|to|until) (%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(8)-%normMonth(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="date_r7a",EXTRACTION="%(reMonthLong|reMonthShort) (?:of )?%reYear4Digit",NORM_VALUE="group(2)-%normMonth(group(1))"
+RULENAME="date_r7b",EXTRACTION="%(reMonthLong|reMonthShort) (?:of )?%reNumWordTeen[ -]%reNumWord2D",NORM_VALUE="%normDurationNumber(group(2))%normDurationNumber(group(3))-%normMonth(group(1))"
+RULENAME="date_r7c",EXTRACTION="%(reMonthLong|reMonthShort)%reAndOrTo%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="date_r7d",EXTRACTION="%(reMonthLong|reMonthShort)%reAndOrTo%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(3))",OFFSET="group(3)-group(4)"
// date_r8
// EXAMPLE r8a_1: November next year
// EXAMPLE r8a_2: May last year
// EXAMPLE r8b_1: May of that year
-RULENAME="date_r8a",EXTRACTION="%reMonthLong (the )?%reThisNextLast year",NORM_VALUE="UNDEF-%normThisNextLast(group(3))-year-%normMonth(group(1))"
-RULENAME="date_r8b",EXTRACTION="%reMonthLong of (that|the same) year",NORM_VALUE="UNDEF-REF-year-MINUS-0-%normMonth(group(1))"
+RULENAME="date_r8a",EXTRACTION="%reMonthLong (?:the )?%reThisNextLast year",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normMonth(group(1))"
+RULENAME="date_r8b",EXTRACTION="%reMonthLong of (?:that|the same) year",NORM_VALUE="UNDEF-REF-year-MINUS-0-%normMonth(group(1))"
////////////////////////
// SEASON GRANULARITY //
@@ -207,10 +121,10 @@ RULENAME="date_r8b",EXTRACTION="%reMonthLong of (that|the same) year",NORM_VALUE
// EXAMPLE r9b_1: winter 2001
// EXAMPLE r9b_2: winter of 2001
// EXAMPLE r9c_1: summer of 69
-RULENAME="date_r9a",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason",NORM_VALUE="UNDEF-year-%normSeason(group(3))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r9b",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason( of | )%reYear4Digit",NORM_VALUE="group(5)-%normSeason(group(3))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r9c",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason( of | )%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(5)-%normSeason(group(3))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r9d",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason( of | )%reNumWordTeen( |-)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(5))%normDurationNumber(group(7))-%normSeason(group(3))",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r9a",EXTRACTION="(?:%rePartWords |[Tt]he |)%reSeason",NORM_VALUE="UNDEF-year-%normSeason(group(2))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r9b",EXTRACTION="(?:%rePartWords |[Tt]he |)%reSeason (?:of )?%reYear4Digit",NORM_VALUE="group(3)-%normSeason(group(2))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r9c",EXTRACTION="(?:%rePartWords |[Tt]he |)%reSeason (?:of )?'?%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normSeason(group(2))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r9d",EXTRACTION="(?:%rePartWords |[Tt]he |)%reSeason (?:of )?%reNumWordTeen[ -]%reNumWord2D",NORM_VALUE="%normDurationNumber(group(3))%normDurationNumber(group(4))-%normSeason(group(2))",NORM_MOD="%normPartWords(group(1))"
//////////////////////////////
@@ -220,9 +134,10 @@ RULENAME="date_r9d",EXTRACTION="(%rePartWords |[Tt]he )?%reSeason( of | )%reNumW
// EXAMPLE r10a_1: the third quarter of 2001
// EXAMPLE r10b_1: the second half
// EXAMPLE r10c_1: the 2001 third quarter
-RULENAME="date_r10a",EXTRACTION="([Tt]he )?%rePartOfYear( of | )%reYear4Digit",NORM_VALUE="group(4)-%normPartOfYear(group(2))"
-RULENAME="date_r10b",EXTRACTION="([Tt]he )?%rePartOfYear",NORM_VALUE="UNDEF-year-%normPartOfYear(group(2))"
-RULENAME="date_r10c",EXTRACTION="([Tt]he )?%reYear4Digit %rePartOfYear",NORM_VALUE="group(2)-%normPartOfYear(group(3))"
+RULENAME="date_r10a",EXTRACTION="(?:[Tt]he )?%rePartOfYear (?:of )?%reYear4Digit",NORM_VALUE="group(2)-%normPartOfYear(group(1))"
+// TODO: causes many false positives from sports, but helps with finance
+RULENAME="date_r10b",EXTRACTION="(?:[Tt]he )?%rePartOfYear",NORM_VALUE="UNDEF-year-%normPartOfYear(group(1))"
+RULENAME="date_r10c",EXTRACTION="(?:[Tt]he )?%reYear4Digit %rePartOfYear",NORM_VALUE="group(1)-%normPartOfYear(group(2))"
// date_r11
// EXAMPLE r11a_1: this year's third quarter
@@ -230,8 +145,8 @@ RULENAME="date_r10c",EXTRACTION="([Tt]he )?%reYear4Digit %rePartOfYear",NORM_VAL
// EXAMPLE r11b_1: the year-earlier first half
// EXAMPLE r11c_1: the second half of this year
RULENAME="date_r11a",EXTRACTION="%reThisNextLast year's %rePartOfYear",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-year-%normPartOfYear(group(2))"
-RULENAME="date_r11b",EXTRACTION="[Tt]he (year-earlier|year-ago) %rePartOfYear",NORM_VALUE="UNDEF-last-year-%normPartOfYear(group(2))"
-RULENAME="date_r11c",EXTRACTION="([Tt]he )?%rePartOfYear of %reThisNextLast year",NORM_VALUE="UNDEF-%normThisNextLast(group(3))-year-%normPartOfYear(group(2))"
+RULENAME="date_r11b",EXTRACTION="[Tt]he (?:year-earlier|year-ago) %rePartOfYear",NORM_VALUE="UNDEF-last-year-%normPartOfYear(group(1))"
+RULENAME="date_r11c",EXTRACTION="(?:[Tt]he )?%rePartOfYear of %reThisNextLast year",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normPartOfYear(group(1))"
//////////////////////
// YEAR GRANULARITY //
@@ -241,12 +156,14 @@ RULENAME="date_r11c",EXTRACTION="([Tt]he )?%rePartOfYear of %reThisNextLast year
// EXAMPLE r12b_1: 1850-58 (find: 1858)
// EXAMPLE r12c_1: nineteen ninety-one
// EXAMPLE r12d_1: two-thousand ten
-RULENAME="date_r12a",EXTRACTION="(the year )?%reYear4Digit",NORM_VALUE="group(2)"
-//RULENAME="date_r12b",EXTRACTION="%reYear4Digit(-|–| and )%reYear2Digit",NORM_VALUE="%SUBSTRING%(group(1),0,2)group(3)",OFFSET="group(3)-group(3)"
+RULENAME="date_r12a",EXTRACTION="(?:the year )?%reYear4Digit",NORM_VALUE="group(1)"
+//RULENAME="date_r12b",EXTRACTION="%reYear4Digit(?:-|–| and )%reYear2Digit",NORM_VALUE="%SUBSTRING%(group(1),0,2)group(2)",OFFSET="group(2)-group(2)"
RULENAME="date_r12b",EXTRACTION="%reYear4Digit%reAndOrTo%reYear2Digit",NORM_VALUE="%SUBSTRING%(group(1),0,2)group(3)",OFFSET="group(3)-group(3)"
-RULENAME="date_r12c",EXTRACTION="%reNumWordTeen( |-)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(1))%normDurationNumber(group(3))"
-RULENAME="date_r12d",EXTRACTION="two( |-)thousand( and)? (%reNumWord2D|%reNumWord1D)",NORM_VALUE="20%normDurationNumber(group(3))"
-RULENAME="date_r12e",EXTRACTION="[Tt]he year two( |-)thousand",NORM_VALUE="2000"
+RULENAME="date_r12c",EXTRACTION="%reNumWordTeen[ -]%reNumWord2D",NORM_VALUE="%normDurationNumber(group(1))%normDurationNumber(group(2))"
+RULENAME="date_r12d",EXTRACTION="two[ -]thousand(?: and)? %(reNumWord1D|reNumWord2D)",NORM_VALUE="20%normDurationNumber(group(1))"
+RULENAME="date_r12e",EXTRACTION="[Tt]he year two[ -]thousand",NORM_VALUE="2000"
+RULENAME="date_r12f1",EXTRACTION="%reYear4Digit%reAndOrTo%reYear4Digit",NORM_VALUE="group(1)",OFFSET="group(1)-group(1)"
+RULENAME="date_r12f2",EXTRACTION="%reYear4Digit%reAndOrTo%reYear4Digit",NORM_VALUE="group(3)",OFFSET="group(3)-group(3)"
////////////////////////
// DECADE GRANULARITY //
@@ -256,30 +173,30 @@ RULENAME="date_r12e",EXTRACTION="[Tt]he year two( |-)thousand",NORM_VALUE="2000"
// EXAMPLE r13b_1: the 90s
// EXAMPLE r13c_1: the seventies
// EXAMPLE r13d_1: the nineteen seventies
-RULENAME="date_r13a",EXTRACTION="([Tt]he )?(\d\d\d0)[']?[s]",NORM_VALUE="%SUBSTRING%(group(2),0,3)"
-RULENAME="date_r13b",EXTRACTION="([Tt]he )?[']?(\d0)[']?[s]",NORM_VALUE="19%SUBSTRING%(group(2),0,1)"
-RULENAME="date_r13c",EXTRACTION="([Tt]he )?%reDecadeWord",NORM_VALUE="19%normDecadeWord(group(2))"
-RULENAME="date_r13d",EXTRACTION="([Tt]he )?%reNumWordTeen %reDecadeWord",NORM_VALUE="%normDurationNumber(group(2))%normDecadeWord(group(3))"
+RULENAME="date_r13a",EXTRACTION="(?:[Tt]he )?(\d\d\d0)'?s",NORM_VALUE="%SUBSTRING%(group(1),0,3)"
+RULENAME="date_r13b",EXTRACTION="(?:[Tt]he )?'?(\d0)'?s",NORM_VALUE="19%SUBSTRING%(group(1),0,1)"
+RULENAME="date_r13c",EXTRACTION="(?:[Tt]he )?%reDecadeWord",NORM_VALUE="19%normDecadeWord(group(1))"
+RULENAME="date_r13d",EXTRACTION="(?:[Tt]he )?%reNumWordTeen %reDecadeWord",NORM_VALUE="%normDurationNumber(group(1))%normDecadeWord(group(2))"
// date_r14
// EXAMPLE r14a_1: the early 1990s
// EXAMPLE r14b_1: the mid-90s
// EXAMPLE r14c_1: the late seventies
// EXAMPLE r14d_1: the early nineteen seventies
-RULENAME="date_r14a",EXTRACTION="([Tt]he )?%rePartWords[\s]?(\d\d\d0)[']?[s]",NORM_VALUE="%SUBSTRING%(group(3),0,3)",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r14b",EXTRACTION="([Tt]he )?%rePartWords[\s]?[']?%reYear2Digit[']?[s]",NORM_VALUE="19%SUBSTRING%(group(3),0,1)",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r14c",EXTRACTION="([Tt]he )?%rePartWords[\s]?%reDecadeWord",NORM_VALUE="19%normDecadeWord(group(3))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r14d",EXTRACTION="([Tt]he )?%rePartWords[\s]?%reNumWordTeen %reDecadeWord",NORM_VALUE="%normDurationNumber(group(3))%normDecadeWord(group(4))",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r14a",EXTRACTION="(?:[Tt]he )?%rePartWords\s?(\d\d\d0)'?s",NORM_VALUE="%SUBSTRING%(group(2),0,3)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r14b",EXTRACTION="(?:[Tt]he )?%rePartWords\s?'?%reYear2Digit'?s",NORM_VALUE="19%SUBSTRING%(group(2),0,1)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r14c",EXTRACTION="(?:[Tt]he )?%rePartWords\s?%reDecadeWord",NORM_VALUE="19%normDecadeWord(group(2))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r14d",EXTRACTION="(?:[Tt]he )?%rePartWords\s?%reNumWordTeen %reDecadeWord",NORM_VALUE="%normDurationNumber(group(2))%normDecadeWord(group(3))",NORM_MOD="%normPartWords(group(1))"
/////////////////////////
// CENTURY GRANULARITY //
/////////////////////////
//// EXAMPLE r15a_1: the 19th century
//// EXAMPLE r15a_2: the seventh century
-RULENAME="date_r15a",EXTRACTION="([Tt]he )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))"
-RULENAME="date_r15b",EXTRACTION="%rePartWords( the)? (%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(3)),-1))",NORM_MOD="%normPartWords(group(1))"
-RULENAME="date_r15c",EXTRACTION="(([Tt]he )?(%reDayNumberTh|%reDayWordTh))%reAndOrTo(the )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(3)),-1))",OFFSET="group(1)-group(1)"
-RULENAME="date_r15d",EXTRACTION="(%rePartWords( the)? (%reDayNumberTh|%reDayWordTh))%reAndOrTo(the )?(%reDayNumberTh|%reDayWordTh) [Cc]entur(y|ies)?",NORM_VALUE="%normDay(%SUM%(%normDay(group(4)),-1))",OFFSET="group(1)-group(1)",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r15a",EXTRACTION="(?:[Tt]he )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(1)),-1))"
+RULENAME="date_r15b",EXTRACTION="%rePartWords(?: the)? %(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r15c",EXTRACTION="(?:[Tt]he )?%(reDayNumberTh|reDayWordTh)%reAndOrTo(?:the )?(?:%rePartWords )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies)",NORM_VALUE="%normDay(%SUM%(%normDay(group(1)),-1))",OFFSET="group(0)-group(1)"
+RULENAME="date_r15d",EXTRACTION="%rePartWords(?: the)? %(reDayNumberTh|reDayWordTh)%reAndOrTo(?:the )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies)?",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))",OFFSET="group(0)-group(2)",NORM_MOD="%normPartWords(group(1))"
///////////////////////////////////
// GRANULARITY INDEPENDENT RULES //
@@ -289,12 +206,12 @@ RULENAME="date_r15d",EXTRACTION="(%rePartWords( the)? (%reDayNumberTh|%reDayWord
// EXAMPLE r16b_1: Early 2001
// EXAMPLE r16c_1: the beginning of November 1999
// EXAMPLE r16d_1: the middle of September
-RULENAME="date_r16a",EXTRACTION="(%reMonthLong)",NORM_VALUE="UNDEF-year-%normMonth(group(1))"
+RULENAME="date_r16a",EXTRACTION="%reMonthLong",NORM_VALUE="UNDEF-year-%normMonth(group(1))"
// 2015-03, Jannik: abbreviated month name on its own is quite dangerous
-//RULENAME="date_r16a",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(1))"
-RULENAME="date_r16b",EXTRACTION="%rePartWords([ ]?)%reYear4Digit",NORM_VALUE="group(3)",NORM_MOD="%normPartWords(group(1))"
-RULENAME="date_r16c",EXTRACTION="%rePartWords([ ]?)(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(6)-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))"
-RULENAME="date_r16d",EXTRACTION="%rePartWords([ ]?)(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))"
+//RULENAME="date_r16a",EXTRACTION="%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(1))"
+RULENAME="date_r16b",EXTRACTION="%rePartWords ?%reYear4Digit",NORM_VALUE="group(2)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r16c",EXTRACTION="%rePartWords ?%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r16d",EXTRACTION="%rePartWords ?%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(2))",NORM_MOD="%normPartWords(group(1))"
// date_r17
// EXAMPLE r17a_1: this year
@@ -303,15 +220,15 @@ RULENAME="date_r16d",EXTRACTION="%rePartWords([ ]?)(%reMonthLong|%reMonthShort)"
// EXAMPLE r17d_1: this Monday
// EXAMPLE r17e_1: this summer
// EXAMPLE r17f_1: this day (using UNDEF-REF normalization)
-RULENAME="date_r17a",EXTRACTION="([Tt]he )?%reThisNextLast %reUnit",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%normUnit(group(3))"
-RULENAME="date_r17b",EXTRACTION="([Tt]he )?%reThisNextLast %reMonthLong",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%LOWERCASE%(group(3))"
-RULENAME="date_r17c",EXTRACTION="([Tt]he )?%reThisNextLast %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%LOWERCASE%(group(3))-%normDay(group(4))"
-RULENAME="date_r17d",EXTRACTION="([Tt]he )?%reThisNextLast %reWeekday",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%LOWERCASE%(group(3))"
-RULENAME="date_r17e",EXTRACTION="([Tt]he )?%reThisNextLast %reSeason",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%normSeason(group(3))"
+RULENAME="date_r17a",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reUnit(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normUnit(group(2))"
+RULENAME="date_r17b",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reMonthLong(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-year-%normMonth(group(2))"
+RULENAME="date_r17c",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reMonthLong %reDayNumber(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-year-%normMonth(group(2))-%normDay(group(3))"
+RULENAME="date_r17d",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reWeekday(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-week-%normDayInWeek(group(2))"
+RULENAME="date_r17e",EXTRACTION="(?:[Tt]he )?%reThisNextLast %reSeason(?! of)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-year-%normSeason(group(2))"
RULENAME="date_r17f",EXTRACTION="[Tt]his day",NORM_VALUE="UNDEF-REF-day-PLUS-0"
-RULENAME="date_r17g",EXTRACTION="([Tt]he )?following %reUnit",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-PLUS-1"
-RULENAME="date_r17h",EXTRACTION="([Tt]he |[Tt]hat |[Tt]his )?same (day|month|year)",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-PLUS-0"
-//RULENAME="date_r17i",EXTRACTION="([Tt]he )?previous %reUnit",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-MINUS-1"
+RULENAME="date_r17g",EXTRACTION="(?:[Tt]he )?following %reUnit",NORM_VALUE="UNDEF-REF-%normUnit(group(1))-PLUS-1"
+RULENAME="date_r17h",EXTRACTION="(?:[Tt]he |[Tt]hat |[Tt]his |)same (day|month|year)",NORM_VALUE="UNDEF-REF-%normUnit(group(1))-PLUS-0"
+//RULENAME="date_r17i",EXTRACTION="(?:[Tt]he )?previous %reUnit",NORM_VALUE="UNDEF-REF-%normUnit(group(1))-MINUS-1"
// date_r18
// EXAMPLE r18a_1: the beginning of this year
@@ -319,55 +236,59 @@ RULENAME="date_r17h",EXTRACTION="([Tt]he |[Tt]hat |[Tt]his )?same (day|month|yea
// EXAMPLE r18c_1: the beginning of this November 24
// EXAMPLE r18d_1: the beginning of this Monday
// EXAMPLE r18e_1: the beginning of this summer
-RULENAME="date_r18a",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reUnit",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%normUnit(group(5))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r18b",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reMonthLong",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%LOWERCASE%(group(5))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r18c",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%LOWERCASE%(group(5))-%normDay(group(6))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r18d",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reWeekday",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%LOWERCASE%(group(5))",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r18e",EXTRACTION="([Tt]he )?%rePartWords([ ]?)%reThisNextLast %reSeason",NORM_VALUE="UNDEF-%normThisNextLast(group(4))-%normSeason(group(5))",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r18a",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reUnit",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%normUnit(group(3))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r18b",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reMonthLong",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r18c",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normMonth(group(3))-%normDay(group(4))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r18d",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reWeekday",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-week-%normDayInWeek(group(3))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r18e",EXTRACTION="(?:[Tt]he |)%rePartWords ?%reThisNextLast %reSeason",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-year-%normSeason(group(3))",NORM_MOD="%normPartWords(group(1))"
// date_r19 (ago)
// EXAMPLE r19a_1: at least several years ago
// EXAMPLE r19b_1: about twenty years ago
// EXAMPLE r19c_1: about 20 years ago
// EXAMPLE r19d_1: a month ago
-RULENAME="date_r19a",EXTRACTION="(%reApproximate )?(several|a couple of|some|a few|many) (%reUnit|minutes|hours)( or so| or more)? (ago|earlier)",NORM_VALUE="PAST_REF"
-RULENAME="date_r19b",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|minutes|hours)( or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(6))-MINUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r19c",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|minutes|hours)( or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(4))-MINUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r19d",EXTRACTION="(%reApproximate )?(an|a) (week-ends?|weekends?|trading days?|days?|months?|weeks?|decades?|century|quarters?|centuries|years?)( or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(4))-MINUS-1",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="date_r19a",EXTRACTION="(?:%reApproximate )?(?:several|a couple of|some|a few|many) %reUnitFine(?: or so| or more|) (?:ago|earlier)",NORM_VALUE="PAST_REF"
+RULENAME="date_r19b",EXTRACTION="(%reApproximate )?%(reNumWord1D|reNumWord2D) %reUnitFine(?: or so| or more|) ago",NORM_VALUE="UNDEF-this-%normUnit(group(4))-MINUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="date_r19c",EXTRACTION="(%reApproximate )?(\d+) %reUnitFine(?: or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(4))-MINUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="date_r19d",EXTRACTION="(%reApproximate )?an? (week-ends?|weekends?|trading days?|days?|months?|weeks?|decades?|century|quarters?|centuries|years?)(?: or so| or more)? ago",NORM_VALUE="UNDEF-this-%normUnit(group(3))-MINUS-1",NORM_MOD="%normApprox4Dates(group(2))"
RULENAME="date_r19e",EXTRACTION="coming %reUnit",NORM_VALUE="FUTURE_REF"
// date_r20 (later)
// EXAMPLE r20a_1: some days later
// EXAMPLE r20b_1: about twenty days later
// EXAMPLE r20c_1: about 20 days later
-// EXAMPLE r20d_1: a year later
-RULENAME="date_r20a",EXTRACTION="(%reApproximate )?(several|a couple of|some|a few|many) (%reUnit|minutes|hours) later",NORM_VALUE="FUTURE_REF"
-RULENAME="date_r20b",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|minutes|hours) later",NORM_VALUE="UNDEF-REF-%normUnit(group(6))-PLUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r20c",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|minutes|hours) later",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r20d",EXTRACTION="(%reApproximate )?(an|a) (%reUnit) later",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-1",NORM_MOD="%normApprox4Dates(group(2))"
+// EXAMPLE r20d_1: a week later
+// EXAMPLE r20f_1: on 30 minutes [something happened]
+// EXAMPLE r20g_1: on approximately thirty minutes [something happened]
+RULENAME="date_r20a",EXTRACTION="(?:%reApproximate )?(?:several|a couple of|some|a few|many) %reUnitFine (?:later|into)",NORM_VALUE="FUTURE_REF"
+RULENAME="date_r20b",EXTRACTION="(?:%reApproximate )?%(reNumWord1D|reNumWord2D) %reUnitFine (?:later|into)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-PLUS-%normDurationNumber(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_r20c",EXTRACTION="(?:%reApproximate )?(\d+) %reUnitFine (?:later|into)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-PLUS-group(2)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_r20d",EXTRACTION="(?:%reApproximate )?an? %reUnitFine (?:later|into)",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-PLUS-1",NORM_MOD="%normApprox4Dates(group(1))"
RULENAME="date_r20e",EXTRACTION="recent %reUnit",NORM_VALUE="PAST_REF"
+RULENAME="date_r20f",EXTRACTION="[Oo]n ((?:%reApproximate )?(\d+) %reUnitFine)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))",OFFSET="group(1)-group(1)"
+RULENAME="date_r20g",EXTRACTION="[Oo]n ((?:%reApproximate )?%(reNumWord1D|reNumWord2D) %reUnitFine)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))",OFFSET="group(1)-group(1)"
// date_r21 (earlier)
// EXAMPLE r21a_1: twenty days earlier
// EXAMPLE r21b_1: about 20 days earlier
-// EXAMPLE r21c_1: a year earlier
-RULENAME="date_r21a",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|minutes|hours) earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(6))-MINUS-%normDurationNumber(group(3))",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r21b",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|minutes|hours) earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-MINUS-group(3)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="date_r21c",EXTRACTION="(%reApproximate )?(an|a) (%reUnit) earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-MINUS-1",NORM_MOD="%normApprox4Dates(group(2))"
+// EXAMPLE r21c_1: a week earlier
+RULENAME="date_r21a",EXTRACTION="(?:%reApproximate )?%(reNumWord1D|reNumWord2D) %reUnitFine earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-MINUS-%normDurationNumber(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_r21b",EXTRACTION="(?:%reApproximate )?(\d+) %reUnitFine earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-MINUS-group(2)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_r21c",EXTRACTION="(?:%reApproximate )?an? %reUnit earlier",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-MINUS-1",NORM_MOD="%normApprox4Dates(group(1))"
// date_r24 (ago/earlier/later normalizing with REFUNIT)
// EXAMPLE r22a_1: a year ago
// EXAMPLE r22b_1: a year later
-RULENAME="date_r22a",EXTRACTION="[Aa] year (ago|earlier)",NORM_VALUE="UNDEF-REFUNIT-year-MINUS-1"
-RULENAME="date_r22b",EXTRACTION="[Aa] year (later)",NORM_VALUE="UNDEF-REFUNIT-year-PLUS-1"
+RULENAME="date_r22a",EXTRACTION="[Aa] year (?:ago|earlier)",NORM_VALUE="UNDEF-REFUNIT-year-MINUS-1"
+RULENAME="date_r22b",EXTRACTION="[Aa] year (?:later)",NORM_VALUE="UNDEF-REFUNIT-year-PLUS-1"
// date_r23
// EXAMPLE r23a_1: the year-earlier first quarter
// EXAMPLE r23b_1: the year-earlier quarter
// EXAMPLE r23c_1: the quarter
-RULENAME="date_r23a",EXTRACTION="([Tt]he )?(year-earlier|year-ago) %rePartOfYear",NORM_VALUE="UNDEF-REF-year-MINUS-1-%normPartOfYear(group(3))"
-RULENAME="date_r23b",EXTRACTION="([Tt]he|[Tt]hat) (year-earlier|year-ago) quarter",NORM_VALUE="UNDEF-REF-quarter-MINUS-4"
-RULENAME="date_r23c",EXTRACTION="([Tt]he|[Tt]hat) quarter",NORM_VALUE="UNDEF-REF-quarter-PLUS-0"
+RULENAME="date_r23a",EXTRACTION="(?:[Tt]he )?(year-earlier|year-ago) %rePartOfYear",NORM_VALUE="UNDEF-REF-year-MINUS-1-%normPartOfYear(group(2))"
+RULENAME="date_r23b",EXTRACTION="(?:[Tt]he|[Tt]hat) (year-earlier|year-ago) quarter",NORM_VALUE="UNDEF-REF-quarter-MINUS-4"
+RULENAME="date_r23c",EXTRACTION="(?:[Tt]he|[Tt]hat) quarter",NORM_VALUE="UNDEF-REF-quarter-PLUS-0"
///////////////////
@@ -382,8 +303,7 @@ RULENAME="date_r23c",EXTRACTION="([Tt]he|[Tt]hat) quarter",NORM_VALUE="UNDEF-REF
RULENAME="date_r24a",EXTRACTION="%reHolidayFix",NORM_VALUE="UNDEF-year-%normHolidayFix(group(1))"
RULENAME="date_r24b",EXTRACTION="%reHolidayFix %reYear4Digit",NORM_VALUE="group(2)-%normHolidayFix(group(1))"
-RULENAME="date_r24c",EXTRACTION="%reHolidayFix %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayFix(group(1))"
-RULENAME="date_r24d",EXTRACTION="%reHolidayFix '%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayFix(group(1))"
+RULENAME="date_r24cd",EXTRACTION="%reHolidayFix '?+%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayFix(group(1))"
//date_r25
//EXAMPLE r25a_1: Eastersunday
@@ -393,8 +313,95 @@ RULENAME="date_r24d",EXTRACTION="%reHolidayFix '%reYear2Digit",NORM_VALUE="UNDEF
RULENAME="date_r25a",EXTRACTION="%reHolidayVar",NORM_VALUE="UNDEF-year-%normHolidayVar(group(1))"
RULENAME="date_r25b",EXTRACTION="%reHolidayVar %reYear4Digit",NORM_VALUE="group(2)-%normHolidayVar(group(1))"
-RULENAME="date_r25c",EXTRACTION="%reHolidayVar %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayVar(group(1))"
-RULENAME="date_r25d",EXTRACTION="%reHolidayVar '%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayVar(group(1))"
+RULENAME="date_r25cd",EXTRACTION="%reHolidayVar '?+%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normHolidayVar(group(1))"
+
+
+// Note: rule with "-BCADhint" in the rule name contain explicit BC or AD information.
+// This information is important during the normalization process.
+
+///////////////////
+// History RULES //
+///////////////////
+
+// historic dates; year granularity; with explicit AD / BC hints
+// EXAMPLE historic_1a-BCADhint: 190 BC (1- to 4-digit year)
+// EXAMPLE historic_1b-BCADhint: BC 190 (1- to 4-digit year)
+// EXAMPLE historic_1c-BCADhint: 190 or 180 BC (find "190 BC"; 1- to 4-digit year)
+RULENAME="date_historic_1a-BCADhint",EXTRACTION="(?:%reApproximate )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%normYearBC(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_historic_1b-BCADhint",EXTRACTION="(?:%reApproximate )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(2))%normYearBC(group(3))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="x_date_historic_1c-BCADhint",EXTRACTION="(?:%reApproximate )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(2))",NORM_MOD="%normApprox4Dates(group(1))",OFFSET="group(0)-group(2)"
+
+// historic dates; month granularity
+// EXAMPLE historic_2a-BCADhint: March 190 BC (1- to 4-digit year)
+// EXAMPLE historic_2b: March 190 (3-digit year)
+// EXAMPLE historic_2c: in March 90 (2-digit year)
+// EXAMPLE historic_2d: March of 90 (2-digit year)
+RULENAME="date_historic_2a-BCADhint",EXTRACTION="(?:%reApproximate )?%(reMonthLong|reMonthShort) (?:of )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_historic_2b",EXTRACTION="(?:%reApproximate )?%(reMonthLong|reMonthShort) (?:of )?([\d][\d][\d])",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="date_historic_2c",EXTRACTION="[Ii]n %(reMonthLong|reMonthShort) %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normMonth(group(1))",OFFSET="group(1)-group(2)"
+RULENAME="date_historic_2d",EXTRACTION="%(reMonthLong|reMonthShort) of %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normMonth(group(1))"
+
+// historic dates; day granularity
+// EXAMPLE historic_3a-BCADhint: March 29, 190 BC (1- to 4-digit year)
+// EXAMPLE historic_3b-BCADhint: 29 March 190 BC (1- to 4-digit year)
+// EXAMPLE historic_3c-BCADhint: 29th of March 190 BC (1- to 4-digit year)
+// EXAMPLE historic_3d: March 29, 190 (3-digit year)
+// EXAMPLE historic_3e: March 29, 90 (2-digit year)
+RULENAME="date_historic_3a-BCADhint",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)\s?,? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(1))-%normDay(group(2))"
+RULENAME="date_historic_3b-BCADhint",EXTRACTION="%reDayNumber %(reMonthLong|reMonthShort)(?:\s?,)? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3c-BCADhint",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh) of %(reMonthLong|reMonthShort) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3d",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)\s?,? (\d\d\d)",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(1))-%normDay(group(2))"
+RULENAME="date_historic_3e",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh),? (\d\d)",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(1))-%normDay(group(2))"
+
+// historic dates; season granularity
+// EXAMPLE historic_4a-BCADhint: summer of 190 BC (1- to 4-digit year)
+RULENAME="date_historic_4a-BCADhint",EXTRACTION="(?:%reApproximate )?(?:the )?%reSeason (?:of )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normSeason(group(2))",NORM_MOD="%normApprox4Dates(group(1))"
+
+// historic dates; century granularity
+// EXAMPLE date_historic_5a-BCADhint: the 2nd century BC
+// EXAMPLE date_historic_5b-BCADhint: beginning of the 2nd century BC
+// EXAMPLE date_historic_5c-BCADhint: 2nd or 3rd century BC (find "2nd century BC")
+// EXAMPLE date_historic_5d-BCADhint: beginning of the 2nd or 3rd century BC (find "beginning 2nd century BC")
+RULENAME="date_historic_5a-BCADhint",EXTRACTION="(?:[Tt]he )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(2))%normDay(%SUM%(%normDay(group(1)),-1))"
+RULENAME="date_historic_5b-BCADhint",EXTRACTION="%rePartWords(?: the)? %(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%normDay(%SUM%(%normDay(group(2)),-1))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_historic_5c-BCADhint",EXTRACTION="((?:[Tt]he )?%(reDayNumberTh|reDayWordTh))%reAndOrTo(?:the )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normDay(%SUM%(%normDay(group(2)),-1))",OFFSET="group(1)-group(1)"
+RULENAME="date_historic_5d-BCADhint",EXTRACTION="(%rePartWords(?: the)? %(reDayNumberTh|reDayWordTh))%reAndOrTo(?:the )?%(reDayNumberTh|reDayWordTh) [Cc]entur(?:y|ies) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(6))%normDay(%SUM%(%normDay(group(3)),-1))",OFFSET="group(1)-group(1)",NORM_MOD="%normPartWords(group(2))"
+
+// historic dates; decade granularity
+// EXAMPLE date_historic_6a-BCADhint: 1990s BC
+// EXAMPLE date_historic_6b-BCADhint: 190s BC
+// EXAMPLE date_historic_6c-BCADhint: 90s BC
+RULENAME="date_historic_6a-BCADhint",EXTRACTION="(?:%rePartWords )?(?:[Tt]he )?([0-9][0-9][0-9]0)'?s %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%SUBSTRING%(group(2),0,3)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_historic_6b-BCADhint",EXTRACTION="(?:%rePartWords )?(?:[Tt]he )?([0-9][0-9]0)'?s %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))0%SUBSTRING%(group(2),0,2)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_historic_6c-BCADhint",EXTRACTION="(?:%rePartWords )?(?:[Tt]he )?([0-9]0)'?s %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))00%SUBSTRING%(group(2),0,1)",NORM_MOD="%normPartWords(group(1))"
+
+// historic dates; year granularity (no AD / BC hints)
+// EXAMPLE date_historic_7a: (in) 190 (3-digit year)
+// EXAMPLE date_historic_7b: (in) 190(,) (3-digit year)
+// EXAMPLE date_historic_7c: (newline)190(newline) (2- to 4-digit year)
+// EXAMPLE date_historic_7d: year of 90 (2-digit year)
+// EXAMPLE date_historic_7e: year of 190 (3-digit year)
+RULENAME="date_historic_7ab",EXTRACTION="[Ii]n ([0-9][0-9][0-9]),?",NORM_VALUE="%normYearBC(group(1))",OFFSET="group(1)-group(1)"
+//RULENAME="date_historic_7b",EXTRACTION="[Ii]n ([0-9][0-9][0-9]),",NORM_VALUE="%normYearBC(group(1))",OFFSET="group(1)-group(1)"
+//RULENAME="date_historic_7c",EXTRACTION="\A([0-9][0-9][0-9]?[0-9]?)\Z",NORM_VALUE="%normYearBC(group(1))"
+RULENAME="date_historic_7d",EXTRACTION="(?:[Tt]he )?year of ([0-9][0-9])",NORM_VALUE="UNDEF-centurygroup(1)"
+RULENAME="date_historic_7e",EXTRACTION="(?:[Tt]he )?year of ([0-9][0-9][0-9])",NORM_VALUE="%normYearBC(group(1))"
+
+// historic dates; 2-digit year granularity (no AD / BC hints)
+// EXAMPLE date_historic_8a: (in) 90(,) (2-digit year)
+// EXAMPLE date_historic_8b: (in) 90 (2-digit year)
+RULENAME="date_historic_8ab",EXTRACTION="[Ii]n ([0-9][0-9]),?",NORM_VALUE="UNDEF-centurygroup(1)",OFFSET="group(1)-group(1)"
+//RULENAME="date_historic_8b",EXTRACTION="[Ii]n ([0-9][0-9])",NORM_VALUE="UNDEF-centurygroup(2)",OFFSET="group(2)-group(2)"
+
+// historic dates; negative rules
+// EXAMPLE date_historic_0a: in 90 cases (2- to 4-digit year)
+// EXAMPLE date_historic_0b: in 90 nice cases (2- to 4-digit year)
+// EXAMPLE date_historic_0c: in 90 nice law cases (2- to 4-digit year)
+// EXAMPLE date_historic_0d: in 90 percent (2- to 4-digit year)
+RULENAME="date_historic_0a_negative",EXTRACTION="[Ii]n %reYearBC (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NNS:",OFFSET="group(1)-group(1)"
+RULENAME="date_historic_0b_negative",EXTRACTION="[Ii]n %reYearBC (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):JJ:group(3):NNS:",OFFSET="group(1)-group(1)"
+RULENAME="date_historic_0c_negative",EXTRACTION="[Ii]n %reYearBC (\S+) (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):JJ:group(3):NN:group(4):NNS:",OFFSET="group(1)-group(1)"
+RULENAME="date_historic_0d_negative",EXTRACTION="[Ii]n %reYearBC (?:kilometers?|miles?|foot|feet|dollars?|percents?|millions?|mi|ft|km|%|\$)",NORM_VALUE="REMOVE"
////////////////////
@@ -408,46 +415,50 @@ RULENAME="date_r25d",EXTRACTION="%reHolidayVar '%reYear2Digit",NORM_VALUE="UNDEF
// EXAMPLE r2b_negative_1: they march the way (if it is a verb)
// EXAMPLE r2c_negative_1: may (if it is a verb)
// EXAMPLE r2d_negative_1: may (or march, fall -- if it is lower case and without any further temporal stuff around it...)
-RULENAME="x_date_r2a_negative",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):VBP:"
-RULENAME="x_date_r2b_negative",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):VVP:"
-RULENAME="x_date_r2c_negative",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):MD:"
-RULENAME="x_date_r2d1_negative",EXTRACTION="(may|march|fall)",NORM_VALUE="REMOVE"
+RULENAME="x_date_r2a_negative",EXTRACTION="%(reMonthLong|reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):VBP:"
+RULENAME="x_date_r2b_negative",EXTRACTION="%(reMonthLong|reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):VVP:"
+RULENAME="x_date_r2c_negative",EXTRACTION="%(reMonthLong|reMonthShort)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):MD:"
+RULENAME="x_date_r2d1_negative",EXTRACTION="(?:may|march|fall)",NORM_VALUE="REMOVE"
RULENAME="x_date_r2d2_negative",EXTRACTION="[Tt]he fall",NORM_VALUE="REMOVE"
-RULENAME="x_date_r2e_negative",EXTRACTION="(March) ([Ff]or|[Aa]gainst|[Tt]o) (the )?([A-Z][\S]+)",NORM_VALUE="REMOVE"
-RULENAME="x_date_r2f_negative",EXTRACTION="([Tt]he )?(Fall) ([Oo]f) (the )?([A-Z][\S]+)",NORM_VALUE="REMOVE"
+RULENAME="x_date_r2e_negative",EXTRACTION="[mM]arch ([Ff]or|[Aa]gainst|[Tt]o) (the )?([A-Z]\S+)",NORM_VALUE="REMOVE"
+RULENAME="x_date_r2f_negative",EXTRACTION="([Tt]he )?[Ff]all [Oo]f (the )?([A-Z]\S+)",NORM_VALUE="REMOVE"
// EXAMPLE r3a_negative_1: 2000 soldiers (four digit number followed by a plural noun)
// EXAMPLE r3b_negative_1: 2000 dead soldiers (four digit number followed by an adjective and a plural noun)
// EXAMPLE r3c_negative_1: 2000 kilometer (four digit number followed a non-temporal unit)
-RULENAME="x_date_r3a_negative",EXTRACTION="%reYear4Digit ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NNS:"
-RULENAME="x_date_r3b_negative",EXTRACTION="%reYear4Digit ([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):JJ:group(3):NNS:"
-RULENAME="x_date_r3c_negative",EXTRACTION="%reYear4Digit(-| )(kilometers?|miles?|foot|feet|dollars?|percents?|millions?|mi|ft|km|%|\$)",NORM_VALUE="REMOVE"
+RULENAME="x_date_r3a_negative",EXTRACTION="%reYear4Digit (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NNS:"
+RULENAME="x_date_r3b_negative",EXTRACTION="%reYear4Digit (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):JJ:group(3):NNS:"
+RULENAME="x_date_r3c_negative",EXTRACTION="%reYear4Digit[- ](?:kilometers?|miles?|foot|feet|dollars?|percents?|millions?|mi|ft|km|%|\$)",NORM_VALUE="REMOVE"
// EXAMPLE r4a_negative: W2000.1920
-RULENAME="x_date_r4a_negative",EXTRACTION="[\S]+%reYear4Digit\.%reYear4Digit",NORM_VALUE="REMOVE"
+RULENAME="x_date_r4a_negative",EXTRACTION="\S+%reYear4Digit\.%reYear4Digit",NORM_VALUE="REMOVE"
// Telephone numbers
-RULENAME="x_date_r5a_negative",EXTRACTION="(\()?\d\d\d(\))? \d\d\d-\d\d\d\d",NORM_VALUE="REMOVE"
+RULENAME="x_date_r5a_negative",EXTRACTION="(?:\()?\d\d\d(?:\))? \d\d\d-\d\d\d\d",NORM_VALUE="REMOVE"
// NEW NEGATIVE RULES: 2015-03-18 (jannik)
-RULENAME="x_date_r6a_negative",EXTRACTION="([Aa]ssociation|[Dd]ocument|[Nn]umber|[Ss]ection|[Rr]esolution|HR|UNSCR|[Rr]oom|[Cc]all|[Ss]ervice at|[Pp]lan) (%reYear2Digit-)?%reYear4Digit",NORM_VALUE="REMOVE"
+RULENAME="x_date_r6a_negative",EXTRACTION="(?:[Aa]ssociation|[Dd]ocument|[Nn]umber|[Ss]ection|[Rr]esolution|HR|UNSCR|[Rr]oom|[Cc]all|[Ss]ervice at|[Pp]lan) (?:%reYear2Digit-)?%reYear4Digit",NORM_VALUE="REMOVE"
// address
-RULENAME="x_date_r7a_negative",EXTRACTION="%reYear4Digit [A-Z]([\S]+) (Avenue|Street)",NORM_VALUE="REMOVE"
+RULENAME="x_date_r7a_negative",EXTRACTION="%reYear4Digit [A-Z]\S+ (?:Avenue|Street)",NORM_VALUE="REMOVE"
// abbreviations
// NOT ONLY an "A" because this is likely to be a determiner
-RULENAME="x_date_r8a_negative",EXTRACTION="(\b[B-Z]|\b[A-Z][A-Z][A-Z])(-| )%reYear4Digit",NORM_VALUE="REMOVE"
-RULENAME="x_date_r8b_negative",EXTRACTION="(\bA)(-)%reYear4Digit",NORM_VALUE="REMOVE"
+RULENAME="x_date_r8a_negative",EXTRACTION="(?:\b[B-Z]|\b[A-Z][A-Z][A-Z])[- ]%reYear4Digit",NORM_VALUE="REMOVE"
+RULENAME="x_date_r8b_negative",EXTRACTION="(?:\bA)-%reYear4Digit",NORM_VALUE="REMOVE"
// Money
-RULENAME="x_date_r9a_negative",EXTRACTION="([Ee]uro|EUR|Dollar|\$) [\d]+(-[\d]+)?",NORM_VALUE="REMOVE"
+RULENAME="x_date_r9a_negative",EXTRACTION="(?:[Ee]uro|EUR|Dollar|USD|[$€£¥¤]|GBP) \d+(?:-\d+)?",NORM_VALUE="REMOVE"
// Unlikely (PAST|PRESENT|FUTURE)_REF expressions
-//RULENAME="x_date_r10a_negative",EXTRACTION="([Ss]oon after)",NORM_VALUE="REMOVE"
+//RULENAME="x_date_r10a_negative",EXTRACTION="[Ss]oon after",NORM_VALUE="REMOVE"
// Issue # 29 - addressed Sept 16, 2015 (heideltime 2.0)
// EXAMPLE"in his 20s"
-RULENAME="x_date_r11a_negative",EXTRACTION="\b[Ii]n (his|her|their) \d\ds",NORM_VALUE="REMOVE"
+RULENAME="x_date_r11a_negative",EXTRACTION="[Ii]n (?:his|her|their) \d\ds",NORM_VALUE="REMOVE"
+
+// 2017-01-18 false positives: third half-century, sixth half-hour episode
+RULENAME="date_r10a_negative",EXTRACTION="\shalf-%reUnit",NORM_VALUE="REMOVE"
+RULENAME="date_r10b_negative",EXTRACTION="(?:[Tt]hird|[Ff](?:ourth|ifth)|[Ss](?:ixth|eventh)) [Hh]alf",NORM_VALUE="REMOVE"
diff --git a/resources/english/rules/resources_rules_durationrules.txt b/resources/english/rules/resources_rules_durationrules.txt
index 3ab12cda..0e9e71d7 100644
--- a/resources/english/rules/resources_rules_durationrules.txt
+++ b/resources/english/rules/resources_rules_durationrules.txt
@@ -14,20 +14,21 @@
// EXAMPLE r1d-1: less than sixty minutes
// EXAMPLE r1e-1: less than 60 minutes
// EXAMPLE r1f-1: several minutes
-RULENAME="duration_r1a",EXTRACTION="(%reApproximate |[Tt]he )?(%reNumWord2D|%reNumWord1D)( more |-| )%reUnit",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(7))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1b1",EXTRACTION="(%reApproximate )?([\d]+)( more | |-)%reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1b2",EXTRACTION="(%reApproximate |[Tt]he )?([\d]+)( more | )%reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1c",EXTRACTION="(%reApproximate )?([Ss]everal|[Aa] couple of|[Ss]ome|[Mm]any|[Aa] few|[Rr]ecent|[Cc]oming) %reUnit",NORM_VALUE="PX%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1d",EXTRACTION="(%reApproximate |[Tt]he )?(%reNumWord2D|%reNumWord1D)( more | |-)(seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(7))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1e1",EXTRACTION="(%reApproximate |[Tt]he )?([\d]+)( more | )(seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1e2",EXTRACTION="(%reApproximate )?([\d]+)( more | |-)(seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1f",EXTRACTION="(%reApproximate )?([Ss]everal|[Aa] couple of|[Ss]ome|[Mm]any|[Aa] few|[Rr]ecent|[Cc]oming) (seconds?|minutes?|hours?)",NORM_VALUE="PTX%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1g1",EXTRACTION="(%reApproximate )(an?)( )%reUnit",NORM_VALUE="P1%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1g2",EXTRACTION="([Aa]n?)( )%reUnit",NORM_VALUE="P1%normUnit4Duration(group(3))"
-RULENAME="duration_r1h1",EXTRACTION="(%reApproximate )(an?)( )(second|minute|hour)",NORM_VALUE="PT1%normUnit4Duration(group(5))"
-RULENAME="duration_r1h2",EXTRACTION="([Aa]n?)( )(second|minute|hour)",NORM_VALUE="PT1%normUnit4Duration(group(3))",POS_CONSTRAINT="group(3):NN:"
-RULENAME="duration_r1i1",EXTRACTION="(%reApproximate )?a (hundred) %reUnit",NORM_VALUE="P100%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r1i2",EXTRACTION="(%reApproximate )?%reNumWord1D (hundred) %reUnit",NORM_VALUE="P%normDurationNumber(group(3))00%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
+RULENAME="duration_r1a",EXTRACTION="(?:%reApproximate |[Tt]he )?%(reNumWord1D|reNumWord2D)(?: more | |-)%reUnit",NORM_VALUE="P%normDurationNumber(group(2))%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1d",EXTRACTION="(?:%reApproximate |[Tt]he )?%(reNumWord1D|reNumWord2D)(?: more | |-)(seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(2))%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1b1",EXTRACTION="(?:%reApproximate )?(\d+)(?: more | |-)%reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1b2",EXTRACTION="(?:%reApproximate )?(\d+)(?: more | |-)(seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(2)%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1e1",EXTRACTION="(?:%reApproximate |[Tt]he )?(\d+) (?:more )?%reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1e2",EXTRACTION="(?:%reApproximate |[Tt]he )?(\d+) (?:more )?(seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(2)%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1c",EXTRACTION="(?:%reApproximate )?(?:[Ss](?:everal|ome)|[Aa] (?:couple of|few)|[Mm]any|[Rr]ecent|[Cc]oming) %reUnit",NORM_VALUE="PX%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1f",EXTRACTION="(?:%reApproximate )?(?:[Ss](?:everal|ome)|[Aa] (?:couple of|few)|[Mm]any|[Rr]ecent|[Cc]oming) (seconds?|minutes?|hours?)",NORM_VALUE="PTX%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))"
+
+RULENAME="duration_r1g1",EXTRACTION="%reApproximate an? %reUnit",NORM_VALUE="P1%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1g2",EXTRACTION="%reApproximate an? (second|minute|hour)",NORM_VALUE="PT1%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1h1",EXTRACTION="(?:[Aa]n?) %reUnit",NORM_VALUE="P1%normUnit4Duration(group(1))"
+RULENAME="duration_r1h2",EXTRACTION="(?:[Aa]n?) (second|minute|hour)",NORM_VALUE="PT1%normUnit4Duration(group(1))",POS_CONSTRAINT="group(1):NN:"
+RULENAME="duration_r1i1",EXTRACTION="(?:%reApproximate )?a hundred %reUnit",NORM_VALUE="P100%normUnit4Duration(group(2))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r1i2",EXTRACTION="(?:%reApproximate )?%reNumWord1D hundred %reUnit",NORM_VALUE="P%normDurationNumber(group(2))00%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))"
// duration_r2
// EXAMPLE r2a-1: at least the last twenty years
@@ -36,37 +37,37 @@ RULENAME="duration_r1i2",EXTRACTION="(%reApproximate )?%reNumWord1D (hundred) %r
// EXAMPLE r2d-1: at least the last twenty minutes
// EXAMPLE r2e-1: at least the last 20 minutes
// EXAMPLE r2f-1: at least the last several minutes
-RULENAME="duration_r2a",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast (%reNumWord2D|%reNumWord1D) %reUnit( or so)?",NORM_VALUE="P%normDurationNumber(group(4))%normUnit4Duration(group(7))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r2b",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast ([\d]+) %reUnit( or so)?",NORM_VALUE="Pgroup(4)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r2c",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast (several|couple of|few) %reUnit( or so)?",NORM_VALUE="PX%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r2d",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast (%reNumWord2D|%reNumWord1D) (seconds?|minutes?|hours?)( or so)?",NORM_VALUE="PT%normDurationNumber(group(4))%normUnit4Duration(group(7))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r2e",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast ([\d]+) (seconds?|minutes?|hours?)( or so)?",NORM_VALUE="PTgroup(4)%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
-RULENAME="duration_r2f",EXTRACTION="(%reApproximate )?[Tt]he %reThisNextLast (several|couple of|few) (seconds?|minutes?|hours?)( or so)?",NORM_VALUE="PTX%normUnit4Duration(group(5))",NORM_MOD="%normApprox4Durations(group(2))"
+RULENAME="duration_r2a",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast %(reNumWord1D|reNumWord2D) %reUnit(?: or so)?",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r2b",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast (\d+) %reUnit(?: or so)?",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r2c",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast (?:several|couple of|few) %reUnit(?: or so)?",NORM_VALUE="PX%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r2d",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast %(reNumWord1D|reNumWord2D) (seconds?|minutes?|hours?)(?: or so)?",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r2e",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast (\d+) (seconds?|minutes?|hours?)(?: or so)?",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))"
+RULENAME="duration_r2f",EXTRACTION="(?:%reApproximate )?[Tt]he %reThisNextLast (?:several|couple of|few) (seconds?|minutes?|hours?)(?: or so)?",NORM_VALUE="PTX%normUnit4Duration(group(3))",NORM_MOD="%normApprox4Durations(group(1))"
// duration_r3
// EXAMPLE r3a-1: a three-year period
// EXAMPLE r3b-1: a 300 year period
// EXAMPLE r3c-1: a three-hour period
// EXAMPLE r3d-1: a 300 hour period
-RULENAME="duration_r3a",EXTRACTION="(([Aa]n?|[Tt]he) )?(%reNumWord2D|%reNumWord1D)( |-)%reUnit (period|term)",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(7))"
-RULENAME="duration_r3b",EXTRACTION="(([Aa]n?|[Tt]he) )?([\d]+)( |-)%reUnit (period|term)",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(5))"
-RULENAME="duration_r3c",EXTRACTION="(([Aa]n?|[Tt]he) )?(%reNumWord2D|%reNumWord1D)( |-)(seconds?|minutes?|hours?) (period|term)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(7))"
-RULENAME="duration_r3d",EXTRACTION="(([Aa]n?|[Tt]he) )?([\d]+)( |-)(seconds?|minutes?|hours?) (period|term)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(5))"
+RULENAME="duration_r3a",EXTRACTION="(?:[Aa]n? |[Tt]he |)%(reNumWord1D|reNumWord2D)[ -]%reUnit (?:period|term)",NORM_VALUE="P%normDurationNumber(group(1))%normUnit4Duration(group(2))"
+RULENAME="duration_r3b",EXTRACTION="(?:[Aa]n? |[Tt]he |)(\d+)[ -]%reUnit (?:period|term)",NORM_VALUE="Pgroup(1)%normUnit4Duration(group(2))"
+RULENAME="duration_r3c",EXTRACTION="(?:[Aa]n? |[Tt]he |)%(reNumWord1D|reNumWord2D)[ -](seconds?|minutes?|hours?) (?:period|term)",NORM_VALUE="PT%normDurationNumber(group(1))%normUnit4Duration(group(2))"
+RULENAME="duration_r3d",EXTRACTION="(?:[Aa]n? |[Tt]he |)(\d+)[ -](seconds?|minutes?|hours?) (?:period|term)",NORM_VALUE="PTgroup(1)%normUnit4Duration(group(2))"
// duration_r4
-RULENAME="duration_r4a",EXTRACTION="(([Aa]n?)( |-)%reUnit) after",NORM_VALUE="P1%normUnit4Duration(group(4))",OFFSET="group(1)-group(1)"
-RULENAME="duration_r4b",EXTRACTION="(([Aa]n?)( |-)(seconds?|minutes?|hours?)) after",NORM_VALUE="PT1%normUnit4Duration(group(4))",OFFSET="group(1)-group(1)"
+RULENAME="duration_r4a",EXTRACTION="([Aa]n?[ -]%reUnit) after",NORM_VALUE="P1%normUnit4Duration(group(2))",OFFSET="group(1)-group(1)"
+RULENAME="duration_r4b",EXTRACTION="([Aa]n?[ -](seconds?|minutes?|hours?)) after",NORM_VALUE="PT1%normUnit4Duration(group(2))",OFFSET="group(1)-group(1)"
// duration_r5
// EXAMPLE: r5_a: two and six days (find "two")
-RULENAME="duration_r5a1",EXTRACTION="(%reApproximate )(%reNumWord2D|%reNumWord1D)( to | or | and |-)(%reNumWord2D|%reNumWord1D) %reUnit",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(10))",NORM_MOD="%normApprox4Durations(group(2))",OFFSET="group(1)-group(3)"
-RULENAME="duration_r5a2",EXTRACTION="(%reNumWord2D|%reNumWord1D)( to | or | and |-)(%reNumWord2D|%reNumWord1D) %reUnit",NORM_VALUE="P%normDurationNumber(group(1))%normUnit4Duration(group(8))",OFFSET="group(1)-group(1)"
-RULENAME="duration_r5b1",EXTRACTION="(%reApproximate )([\d]+)( to | or | and |-)([\d]+) %reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(6))",NORM_MOD="%normApprox4Durations(group(2))",OFFSET="group(1)-group(3)"
-RULENAME="duration_r5b2",EXTRACTION="([\d]+)( to | or | and |-)([\d]+) %reUnit",NORM_VALUE="Pgroup(1)%normUnit4Duration(group(4))",OFFSET="group(1)-group(1)"
-RULENAME="duration_r5c1",EXTRACTION="(%reApproximate )(%reNumWord2D|%reNumWord1D)( to | or | and |-)(%reNumWord2D|%reNumWord1D) (seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(10))",NORM_MOD="%normApprox4Durations(group(2))",OFFSET="group(1)-group(3)"
-RULENAME="duration_r5c2",EXTRACTION="(%reNumWord2D|%reNumWord1D)( to | or | and |-)(%reNumWord2D|%reNumWord1D) (seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(1))%normUnit4Duration(group(8))",OFFSET="group(1)-group(1)"
-RULENAME="duration_r5d1",EXTRACTION="(%reApproximate )([\d]+)( to | or | and |-)([\d]+) (seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(6))",NORM_MOD="%normApprox4Durations(group(2))",OFFSET="group(1)-group(3)"
-RULENAME="duration_r5d2",EXTRACTION="([\d]+)( to | or | and |-)([\d]+) (seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(1)%normUnit4Duration(group(4))",OFFSET="group(1)-group(1)"
+RULENAME="duration_r5a1",EXTRACTION="%reApproximate %(reNumWord1D|reNumWord2D)(?: to | or | and |-)%(reNumWord1D|reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(2))%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))",OFFSET="group(1)-group(2)"
+RULENAME="duration_r5b1",EXTRACTION="%(reNumWord1D|reNumWord2D)(?: to | or | and |-)%(reNumWord1D|reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(1))%normUnit4Duration(group(3))",OFFSET="group(1)-group(1)"
+RULENAME="duration_r5c1",EXTRACTION="%reApproximate (\d+)(?: to | or | and |-)(\d+) %reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))",OFFSET="group(1)-group(2)"
+RULENAME="duration_r5d1",EXTRACTION="(\d+)(?: to | or | and |-)(\d+) %reUnit",NORM_VALUE="Pgroup(1)%normUnit4Duration(group(3))",OFFSET="group(1)-group(1)"
+RULENAME="duration_r5a2",EXTRACTION="%reApproximate %(reNumWord1D|reNumWord2D)(?: to | or | and |-)%(reNumWord1D|reNumWord2D) (seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(2))%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))",OFFSET="group(1)-group(2)"
+RULENAME="duration_r5b2",EXTRACTION="%(reNumWord1D|reNumWord2D)(?: to | or | and |-)%(reNumWord1D|reNumWord2D) (seconds?|minutes?|hours?)",NORM_VALUE="PT%normDurationNumber(group(1))%normUnit4Duration(group(3))",OFFSET="group(1)-group(1)"
+RULENAME="duration_r5c2",EXTRACTION="%reApproximate (\d+)(?: to | or | and |-)(\d+) (seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(2)%normUnit4Duration(group(4))",NORM_MOD="%normApprox4Durations(group(1))",OFFSET="group(1)-group(2)"
+RULENAME="duration_r5d2",EXTRACTION="(\d+)(?: to | or | and |-)(\d+) (seconds?|minutes?|hours?)",NORM_VALUE="PTgroup(1)%normUnit4Duration(group(3))",OFFSET="group(1)-group(1)"
////////////////////
// NEGATIVE RULES //
@@ -75,12 +76,11 @@ RULENAME="duration_r5d2",EXTRACTION="([\d]+)( to | or | and |-)([\d]+) (seconds?
// EXAMPLE r1a_negative-1: about 200 years older
// EXAMPLE r1b_negative-1: several days old
// EXAMPLE r1c_negative-1: 59-year-old
-RULENAME="duration_r1a_negation",EXTRACTION="(%reApproximate |[Tt]he )?(%reNumWord2D|%reNumWord1D|[\d]+) (%reUnit|minutes?|hours?) (older|younger|old|young)",NORM_VALUE="REMOVE"
-RULENAME="duration_r1b_negation",EXTRACTION="(%reApproximate |[Tt]he )?([Ss]everal|[Aa] couple of|[Ss]ome|[Mm]any|[Aa] few|[Rr]ecent|[Cc]oming) (%reUnit|minutes?|hours?) (older|younger|old|young)",NORM_VALUE="REMOVE"
-RULENAME="duration_r1c_negation",EXTRACTION="([Tt]he )?(%reNumWord2D|%reNumWord1D|[\d]+)-(%reUnit|minutes?|hours?)-(older|younger|old|young)",NORM_VALUE="REMOVE"
-RULENAME="duration_r1d_negation",EXTRACTION="(%reApproximate )?(an|a)( )%reUnit-([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(5):NN:"
+RULENAME="duration_r1a_negation",EXTRACTION="(?:%reApproximate |[Tt]he |)(?:%(reNumWord1D|reNumWord2D)|[\d]+) %reUnitFine (?:older|younger|old|young)",NORM_VALUE="REMOVE"
+RULENAME="duration_r1b_negation",EXTRACTION="(?:%reApproximate |[Tt]he |)(?:[Ss]everal|[Aa] couple of|[Ss]ome|[Mm]any|[Aa] few|[Rr]ecent|[Cc]oming) %reUnitFine (?:older|younger|old|young)",NORM_VALUE="REMOVE"
+RULENAME="duration_r1c_negation",EXTRACTION="(?:[Tt]he |)(?:%(reNumWord1D|reNumWord2D)|\d+)-%reUnitFine-(?:older|younger|old|young)",NORM_VALUE="REMOVE"
+RULENAME="duration_r1d_negation",EXTRACTION="(?:%reApproximate |)an? %reUnit-(?:\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NN:"
// NEW NEGATIVE RULES: 2015-03-18 (jannik)
-RULENAME="duration_r2a_negation",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D)(-| )quarters?",NORM_VALUE="REMOVE"
-RULENAME="duration_r2b_negation",EXTRACTION="(%reApproximate )?(a) quarter",NORM_VALUE="REMOVE"
\ No newline at end of file
+RULENAME="duration_r2a_negation",EXTRACTION="(?:%reApproximate )?(?:%(reNumWord1D|reNumWord2D)|a)[- ]quarters?",NORM_VALUE="REMOVE"
diff --git a/resources/english/rules/resources_rules_setrules.txt b/resources/english/rules/resources_rules_setrules.txt
index f4e4d38b..51c8eba3 100644
--- a/resources/english/rules/resources_rules_setrules.txt
+++ b/resources/english/rules/resources_rules_setrules.txt
@@ -14,7 +14,7 @@
// EXAMPLE r1d-1: every summer
RULENAME="set_r1a",EXTRACTION="([Ee]very|[Ee]ach) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))"
RULENAME="set_r1b",EXTRACTION="([Ee]very|[Ee]ach) %reWeekday",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(2))",NORM_QUANT="%UPPERCASE%(group(1))",NORM_FREQ="1W"
-RULENAME="set_r1c",EXTRACTION="([Ee]very|[Ee]ach) (%reMonthLong|%reMonthShort)",NORM_VALUE="XXXX-%normMonth(group(2))",NORM_QUANT="%UPPERCASE%(group(1))",NORM_FREQ="1M"
+RULENAME="set_r1c",EXTRACTION="([Ee]very|[Ee]ach) %(reMonthLong|reMonthShort)",NORM_VALUE="XXXX-%normMonth(group(2))",NORM_QUANT="%UPPERCASE%(group(1))",NORM_FREQ="1M"
RULENAME="set_r1d",EXTRACTION="([Ee]very|[Ee]ach) %reSeason",NORM_VALUE="XXXX-%normSeason(group(2))",NORM_QUANT="%UPPERCASE%(group(1))",NORM_FREQ="1S"
// set_r2
@@ -24,22 +24,22 @@ RULENAME="set_r1d",EXTRACTION="([Ee]very|[Ee]ach) %reSeason",NORM_VALUE="XXXX-%n
// EXAMPLE r2d-1: 40 times per month
// EXAMPLE r2e-1: a month
// EXAMPLE r2f-1: a minute
-RULENAME="set_r2a",EXTRACTION="[Oo]nce (a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_FREQ="1"
-RULENAME="set_r2b",EXTRACTION="[Tt]wice (a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_FREQ="2"
-RULENAME="set_r2c",EXTRACTION="(%reNumWord1D|%reNumWord2D) times? (a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(5)),0,1))",NORM_FREQ="%normDurationNumber(group(1))"
-RULENAME="set_r2d",EXTRACTION="([\d]+) times? (a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_FREQ="group(1)"
-RULENAME="set_r2e",EXTRACTION="(a|an)( |-)%reUnit",NORM_VALUE="P1%normUnit4Duration(group(3))",NORM_FREQ="1"
-RULENAME="set_r2f",EXTRACTION="(a|an)( |-)(minutes?|hours?)",NORM_VALUE="PT1%normUnit4Duration(group(3))",NORM_FREQ="1"
+RULENAME="set_r2a",EXTRACTION="[Oo]nce (?:a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(1)),0,1))",NORM_FREQ="1"
+RULENAME="set_r2b",EXTRACTION="[Tt]wice (?:a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(1)),0,1))",NORM_FREQ="2"
+RULENAME="set_r2c",EXTRACTION="%(reNumWord1D|reNumWord2D) times? (?:a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_FREQ="%normDurationNumber(group(1))"
+RULENAME="set_r2d",EXTRACTION="([\d]+) times? (?:a|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_FREQ="group(1)"
+RULENAME="set_r2e",EXTRACTION="(?:a|an)[\s-]%reUnit",NORM_VALUE="P1%normUnit4Duration(group(1))",NORM_FREQ="1"
+RULENAME="set_r2f",EXTRACTION="(?:a|an)[\s-](minutes?|hours?)",NORM_VALUE="PT1%normUnit4Duration(group(1))",NORM_FREQ="1"
// set_r3
// EXAMPLE r3a-1: every 5 years
// EXAMPLE r3b-1: every two days
-RULENAME="set_r3a",EXTRACTION="([Ee]very) ([\d]+) %reUnit",NORM_VALUE="Pgroup(2)%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))"
-RULENAME="set_r3b",EXTRACTION="([Ee]very) (%reNumWord1D|%reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(2))%UPPERCASE%(%SUBSTRING%(%normUnit(group(5)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))"
+RULENAME="set_r3a",EXTRACTION="([Ee]very) (\d+) %reUnit",NORM_VALUE="Pgroup(2)%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))"
+RULENAME="set_r3b",EXTRACTION="([Ee]very) %(reNumWord1D|reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(2))%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_QUANT="%UPPERCASE%(group(1))"
// set_r4
// EXAMPLE r4a-1: 2 days each week
-RULENAME="set_r4a",EXTRACTION="([\d]+) %reUnit (each|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(4)),0,1))",NORM_QUANT="EACH",NORM_FREQ="group(1)%normUnit(group(2))"
+RULENAME="set_r4a",EXTRACTION="([\d]+) %reUnit (?:each|per) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(3)),0,1))",NORM_QUANT="EACH",NORM_FREQ="group(1)%normUnit(group(2))"
// set_r5
// EXAMPLE r5a-1: annually
@@ -52,5 +52,5 @@ RULENAME="set_r5a",EXTRACTION="%reSetWords",NORM_VALUE="%normSetWords(group(1))"
// EXAMPLE r6a-1: Monday afternoons
// EXAMPLE r6b-1: Monday and Tuesday nights (find: Monday nights)
RULENAME="set_r6a",EXTRACTION="%reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(2))",NORM_FREQ="1W"
-RULENAME="set_r6b",EXTRACTION="%reWeekday (and|or) %reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(4))",NORM_FREQ="1W",OFFSET="group(1)-group(1)"
+RULENAME="set_r6b",EXTRACTION="%reWeekday (?:and|or) %reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(3))",NORM_FREQ="1W",OFFSET="group(1)-group(1)"
diff --git a/resources/english/rules/resources_rules_timerules.txt b/resources/english/rules/resources_rules_timerules.txt
index 6d77952c..84a1dc63 100644
--- a/resources/english/rules/resources_rules_timerules.txt
+++ b/resources/english/rules/resources_rules_timerules.txt
@@ -14,12 +14,12 @@
// EXAMPLE r1c-1: 12/29/2000 20:29
// EXAMPLE r1d-1: 12/29/2000 20:29:29
// EXAMPLE r1e-1: 12/29/2000 20:29:29.79
-RULENAME="time_r1a",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7):group(8)"
-RULENAME="time_r1b",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7)"
-RULENAME="time_r1c",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6)"
-RULENAME="time_r1d",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6):group(7)"
-RULENAME="time_r1e",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute:%reTimeMinute\.%reYear2Digit",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6):group(7).group(8)"
-RULENAME="time_r1f",EXTRACTION="%reYear4Digit%reMonthNumber%reDayNumber-%reTimeHour(?:-|:)%reTimeMinute(?:-|:)%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5):group(6)"
+RULENAME="time_r1a",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber[T ]%reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5):group(6)"
+RULENAME="time_r1b",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber[T ]%reTimeHour:%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5)"
+RULENAME="time_r1c",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5)"
+RULENAME="time_r1d",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5):group(6)"
+RULENAME="time_r1e",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute\.%reYear2Digit",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5):group(6).group(7)"
+RULENAME="time_r1f",EXTRACTION="%reYear4Digit%reMonthNumber%reDayNumber-%reTimeHour[-:]%reTimeMinute[-:]%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5):group(6)"
// time_r2
// EXAMPLE r2a-1: 09-24-99 1145EST (TimeStamp style with timezone information)
@@ -27,10 +27,10 @@ RULENAME="time_r1f",EXTRACTION="%reYear4Digit%reMonthNumber%reDayNumber-%reTimeH
// EXAMPLE r2c-1: Wed, 29 Dec 2004 00:28:16 +0000
// EXAMPLE r2d-1: Sat, 29 Jan 2005 17:21:13 -0600
// EXAMPLE r2d-2: 1 Feb 2005 16:13:33 +1300
-RULENAME="time_r2a",EXTRACTION="(%reMonthNumber-%reDayNumber-%reYear2Digit)( %reTimeHour(:)?%reTimeMinute)%reTimezone",NORM_VALUE="UNDEF-centurygroup(4)-group(2)-group(3)T%normMinute(group(6)):%normMinute(group(8))%normTimezone(group(9))"
-RULENAME="time_r2b",EXTRACTION="%reMonthLong %reDayNumber, %reYear4Digit %reTimeHour(:)?%reTimeMinute %reTimezone",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))T%normMinute(group(4)):%normMinute(group(6))"
-RULENAME="time_r2c",EXTRACTION="((Mon|Tue|Wed|Thu|Fri|Sat|Sun), )?%reDayNumber (%reMonthLong|%reMonthShort) %reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute (\+|-)0000",NORM_VALUE="group(7)-%normMonth(group(4))-%normDay(group(3))Tgroup(8):group(9):group(10)"
-RULENAME="time_r2d",EXTRACTION="((Mon|Tue|Wed|Thu|Fri|Sat|Sun), )?%reDayNumber (%reMonthLong|%reMonthShort) %reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute (\+|-)(\d\d)00",NORM_VALUE="group(7)-%normMonth(group(4))-%normDay(group(3))Tgroup(8):group(9):group(10)group(11)group(12)"
+RULENAME="time_r2a",EXTRACTION="%reMonthNumber-%reDayNumber-%reYear2Digit %reTimeHour:?%reTimeMinute%reTimezone",NORM_VALUE="UNDEF-centurygroup(3)-group(1)-group(2)T%normMinute(group(4)):%normMinute(group(5))%normTimezone(group(6))"
+RULENAME="time_r2b",EXTRACTION="%reMonthLong %reDayNumber, %reYear4Digit %reTimeHour:?%reTimeMinute %reTimezone",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))T%normMinute(group(4)):%normMinute(group(5))"
+// matched by r2d RULENAME="time_r2c",EXTRACTION="(?:Mon, |Tue, |Wed, |Thu, |Fri, |Sat, |Sun, )?%reDayNumber %(reMonthLong|reMonthShort) %reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute (\+|-)0000",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))Tgroup(4):group(5):group(6)"
+RULENAME="time_r2d",EXTRACTION="(?:Mon, |Tue, |Wed, |Thu, |Fri, |Sat, |Sun, )?%reDayNumber %(reMonthLong|reMonthShort) %reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute (\+|-)(\d\d)00",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))Tgroup(4):group(5):group(6)group(7)group(8)"
RULENAME="time_r2e",EXTRACTION="(%reMonthLong %reDayNumber, %reTimeHour %reTimezone) in %reYear4Digit",NORM_VALUE="group(6)-%normMonth(group(2))-%normDay(group(3))T%normMinute(group(4)):00",OFFSET="group(1)-group(1)"
RULENAME="time_r2f",EXTRACTION="(%reMonthLong %reDayNumber, %reTimeHour:%reTimeMinute%reTimezone) in %reYear4Digit",NORM_VALUE="group(7)-%normMonth(group(2))-%normDay(group(3))T%normMinute(group(4)):%normMinute(group(5))",OFFSET="group(1)-group(1)"
@@ -59,7 +59,7 @@ RULENAME="time_r3e",EXTRACTION="%reThisNextLast %reWeekday %rePartOfDay",NORM_VA
// EXAMPLE r4a-1: earlier this afternoon
// EXAMPLE r4a-2: later last night
// EXAMPLE r4b-1: tonight
-RULENAME="time_r4a",EXTRACTION="(([Ee]arlier|[Ll]ater|[Ee]arly) )?%reThisNextLast %rePartOfDay",NORM_VALUE="UNDEF-%normThisNextLast(group(3))-dayT%normPartOfDay(group(4))"
+RULENAME="time_r4a",EXTRACTION="(?:[Ee]arlier |[Ll]ater |[Ee]arly )?%reThisNextLast %rePartOfDay",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-dayT%normPartOfDay(group(2))"
RULENAME="time_r4b",EXTRACTION="([Tt]onight)",NORM_VALUE="UNDEF-this-dayT%normPartOfDay(group(1))"
///////////////////////////
@@ -72,15 +72,15 @@ RULENAME="time_r4b",EXTRACTION="([Tt]onight)",NORM_VALUE="UNDEF-this-dayT%normPa
// EXAMPLE r5c-1: 11:30 a.m.
// EXAMPLE r5d-1: 9:30 p.m.
// EXAMPLE r5e-1: 10:30:34 a.m.
-// EXAMPLE r5e-1: 10:30:34 p.m.
-RULENAME="time_r5a",EXTRACTION="(%reApproximate )?%reTimeHour[\s]*[Aa][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):00",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5b",EXTRACTION="(%reApproximate )?%reTimeHour[\s]*[Pp][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(3)),12):00",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5c",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute[\s]*[Aa][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5d",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute[\s]*[Pp][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(3)),12):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5e",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute:%reTimeMinute[\s]*[Aa][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5f",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute:%reTimeMinute[\s]*[Pp][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(3)),12):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5g",EXTRACTION="(%reApproximate )?%reTimeHour%reTimeMinute[\s]*[Aa][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r5h",EXTRACTION="(%reApproximate )?%reTimeHour%reTimeMinute[\s]*[Pp][\.]?[Mm][\.]?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(3)),12):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
+// EXAMPLE r5f-1: 10:30:34 p.m.
+RULENAME="time_r5a",EXTRACTION="(?:%reApproximate )?%reTimeHour\s*[Aa]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):00",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5b",EXTRACTION="(?:%reApproximate )?%reTimeHour\s*[Pp]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):00",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5c",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute\s*[Aa]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5d",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute\s*[Pp]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5e",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute:%reTimeMinute\s*[Aa]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3):group(4)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5f",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute:%reTimeMinute\s*[Pp]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):group(3):group(4)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5g",EXTRACTION="(?:%reApproximate )?%reTimeHour%reTimeMinute\s*[Aa]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r5h",EXTRACTION="(?:%reApproximate )?%reTimeHour%reTimeMinute\s*[Pp]\.?[Mm]\.?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
// time_r6
@@ -89,24 +89,24 @@ RULENAME="time_r5h",EXTRACTION="(%reApproximate )?%reTimeHour%reTimeMinute[\s]*[
// EXAMPLE r6b-1: 9 pm Wednesday
// EXAMPLE r6c-1: 9:30 a.m. Wednesday
// EXAMPLE r6d-1: 9:30 p.m. Wednesday
-RULENAME="time_r6a",EXTRACTION="(%reApproximate )?%reTimeHour[\s]*[Aa][\.]?[Mm][\.]? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%normDay(group(3)):00",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r6b",EXTRACTION="(%reApproximate )?%reTimeHour[\s]*[Pp][\.]?[Mm][\.]? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%SUM%(%normDay(group(3)),12):00",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r6c",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute[\s]*[Aa][\.]?[Mm][\.]? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(5))T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r6d",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute[\s]*[Pp][\.]?[Mm][\.]? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(5))T%SUM%(%normDay(group(3)),12):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r6e",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(5))T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="time_r6a",EXTRACTION="(?:%reApproximate )?%reTimeHour\s*[Aa]\.?[Mm]\.? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(3))T%normDay(group(2)):00",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r6b",EXTRACTION="(?:%reApproximate )?%reTimeHour\s*[Pp]\.?[Mm]\.? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(3))T%SUM%(%normDay(group(2)),12):00",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r6c",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute\s*[Aa]\.?[Mm]\.? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%normDay(group(2)):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r6d",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute\s*[Pp]\.?[Mm]\.? %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%SUM%(%normDay(group(2)),12):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
+RULENAME="time_r6e",EXTRACTION="(?:%reApproximate )?%reTimeHour:%reTimeMinute %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(4))T%normDay(group(2)):group(3)",NORM_MOD="%normApprox4Dates(group(1))"
// time_r7
// added 2015-03-18 (jannik)
-RULENAME="time_r7a",EXTRACTION="(%reApproximate )?%reTimeHour%reTimeMinute %reTimezone",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
-RULENAME="time_r7b",EXTRACTION="((%reApproximate )?%reTimeHour%reTimeMinute %reTimezone) on %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-year-%normMonth(group(7))-%normDay(group(8))T%normDay(group(4)):group(5)",NORM_MOD="%normApprox4Dates(group(3))",OFFSET="group(1)-group(1)"
+RULENAME="time_r7a",EXTRACTION="(%reApproximate )?%reTimeHour:?%reTimeMinute %reTimezone",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(3)):group(4)",NORM_MOD="%normApprox4Dates(group(2))"
+RULENAME="time_r7b",EXTRACTION="((%reApproximate )?%reTimeHour:?%reTimeMinute %reTimezone) on %reMonthLong %reDayNumber",NORM_VALUE="UNDEF-year-%normMonth(group(7))-%normDay(group(8))T%normDay(group(4)):group(5)",NORM_MOD="%normApprox4Dates(group(3))",OFFSET="group(1)-group(1)"
// time_r8
// EXAMPLE r8a: the morning of April 18, 1775
-// EXAMPLE r8c: the morning of April 18
-RULENAME="time_r8a",EXTRACTION="([Tt]he )?%rePartOfDay of (%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)[\s]?,? %reYear4Digit(, %reWeekday)?",NORM_VALUE="group(9)-%normMonth(group(3))-%normDay(group(6))T%normPartOfDay(group(2))"
-RULENAME="time_r8b",EXTRACTION="([Tt]he )?%rePartOfDay of (%reMonthLong|%reMonthShort) (%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(3))-%normDay(group(6))T%normPartOfDay(group(2))"
-RULENAME="time_r8c",EXTRACTION="([Tt]he )?%rePartOfDay of (the )?(%reDayWordTh|%reDayNumberTh|%reDayNumber)( of | )(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(12)-%normMonth(group(9))-%normDay(group(4))T%normPartOfDay(group(2))"
-RULENAME="time_r8d",EXTRACTION="([Tt]he )?%rePartOfDay of (the )?(%reDayWordTh|%reDayNumberTh|%reDayNumber)( of | )(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(9))-%normDay(group(4)))T%normPartOfDay(group(2))"
+// EXAMPLE r8b: the morning of April 18
+RULENAME="time_r8a",EXTRACTION="(?:[Tt]he )?%rePartOfDay of %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)\s?,? %reYear4Digit(?:, %reWeekday)?",NORM_VALUE="group(4)-%normMonth(group(2))-%normDay(group(3))T%normPartOfDay(group(1))"
+RULENAME="time_r8b",EXTRACTION="(?:[Tt]he )?%rePartOfDay of %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(3))T%normPartOfDay(group(1))"
+RULENAME="time_r8c",EXTRACTION="(?:[Tt]he )?%rePartOfDay of (?:the )?%(reDayNumber|reDayNumberTh|reDayWordTh) (?:of )?%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(3))-%normDay(group(2))T%normPartOfDay(group(1))"
+RULENAME="time_r8d",EXTRACTION="(?:[Tt]he )?%rePartOfDay of (?:the )?%(reDayNumber|reDayNumberTh|reDayWordTh) (?:of )?%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))-%normDay(group(2))T%normPartOfDay(group(1))"
diff --git a/resources/german/normalization/resources_normalization_normMonth.txt b/resources/german/normalization/resources_normalization_normMonth.txt
index 028420ee..602e6e7f 100644
--- a/resources/german/normalization/resources_normalization_normMonth.txt
+++ b/resources/german/normalization/resources_normalization_normMonth.txt
@@ -1,89 +1,61 @@
// author: Jannik Strötgen
-// email: stroetgen@uni-hd.de
+// email: stroetgen@uni-hd\.de
// date: 2011-06-10
// This file contains "month words" and their normalized expressions
-// according to TIMEX3 format.
+// according to TIMEX3 format\.
// For example, the normalized value of "2" is "02"
// FORMAT: "month-word","normalized-month-word"
// Full month names (including historic writings)
// January
-"[Jj]anuar","01"
+"[Jj]anuar(ii|y|)","01"
"[Jj]änner","01"
-"[Jj]anuarii","01"
-"[Ff]ebruar","02"
-"[Ff]ebruar(ii)?","02"
+"[Ff]ebruar(ii|y|)","02"
"[Hh]ornung","02"
"[Mm]ärz","03"
+"[Mm]arch","03"
"[Mm]aerz","03"
"[Mm]art(ii)?","03"
"[Mm]erz","03"
-"[Aa]pril","04"
-"[Aa]prilis","04"
-"[Mm]ai","05"
-"[Mm]ai(i)?","05"
+"[Aa]pril(is)?","04"
+"[Mm]aii?","05"
"[Mm]ay","05"
-"[Jj]uni","06"
-"[Jj]unii","06"
-"[Jj]uli","07"
-"[Jj]ulii","07"
-"[Aa]ugust","08"
-"[Aa]ugusti","08"
+"[Jj]un(ii?|y)","06"
+"[Jj]ul(ii?|y)","07"
+"[Aa]ugusti?","08"
"[Ss]eptember","09"
-"[Ss]eptembr(is|.)?","09"
-"[Oo]ktober","10"
-"[Oo]ctober","10"
-"[Oo]ctobr(is|.)","10"
+"[Ss]eptembr(is|\.)?","09"
+"[Oo][ck]tober","10"
+"[Oo]ctobr(is|\.)?","10"
"[Nn]ovember","11"
-"[Nn]ovembr(is|.)","11"
-"[Dd]ezember","12"
-"[Dd]ecember","12"
-"[Dd]ecembr(is|.)","12"
+"[Nn]ovembr(is|\.)?","11"
+"[Dd]e[cz]ember","12"
+"[Dd]ecembr(is|\.)?","12"
// Abbreviated month names
-"[Jj]an","01"
-"[Jj]an.","01"
-"[Ff]eb","02"
-"[Ff]eb.","02"
-"[Mm]är","03"
-"[Mm]är.","03"
-"[Aa]pr","04"
-"[Aa]pr.","04"
-"[Mm]ai.","05"
-"[Jj]un","06"
-"[Jj]un.","06"
-"[Jj]ul","07"
-"[Jj]ul.","07"
-"[Aa]ug","08"
-"[Aa]ug.","08"
-"[Ss]ept?","09"
-"[Ss]ept?.","09"
-"[Oo]kt","10"
-"[Oo]kt.","10"
-"[Nn]ov","11"
-"[Nn]ov.","11"
-"[Dd]ez","12"
-"[Dd]ez.","12"
+"[Jj]an\.?","01"
+"[Ff]eb\.?","02"
+"[Mm][aä]r\.?","03"
+"[Aa]pr\.?","04"
+"[Mm]a[iy]","05"
+"[Jj]un\.?","06"
+"[Jj]ul\.?","07"
+"[Aa]ug\.?","08"
+"[Ss]ept?\.?","09"
+"[Oo][ck]t\.?","10"
+"[Nn]ov\.?","11"
+"[Dd]e[cz]\.?","12"
// numbers
-"1","01"
-"01","01"
-"2","02"
-"02","02"
-"3","03"
-"03","03"
-"4","04"
-"04","04"
-"5","05"
-"05","05"
-"6","06"
-"06","06"
-"7","07"
-"07","07"
-"8","08"
-"08","08"
-"9","09"
-"09","09"
-"10","10"
-"11","11"
-"12","12"
+"0?1\.?","01"
+"0?2\.?","02"
+"0?3\.?","03"
+"0?4\.?","04"
+"0?5\.?","05"
+"0?6\.?","06"
+"0?7\.?","07"
+"0?8\.?","08"
+"0?9\.?","09"
+"10\.?","10"
+"11\.?","11"
+"12\.?","12"
diff --git a/resources/german/normalization/resources_normalization_normMonthToEnglish.txt b/resources/german/normalization/resources_normalization_normMonthToEnglish.txt
index 54c5d2f0..30ccc60e 100644
--- a/resources/german/normalization/resources_normalization_normMonthToEnglish.txt
+++ b/resources/german/normalization/resources_normalization_normMonthToEnglish.txt
@@ -1,78 +1,53 @@
// author: Jannik Strötgen
-// email: stroetgen@uni-hd.de
+// email: stroetgen@uni-hd\.de
// date: 2011-09-13
// This file contains "month words" and their English expressions
-// according to TIMEX3 format.
+// according to TIMEX3 format\.
// For example, the normalized value of "2" is "february"
// FORMAT: "month-word","normalized-month-word"
-
-
// January
-"[Jj]an","january"
-"[Jj]an.","january"
-"[Jj]anuar","january"
-"[Jj]anuarii","january"
+"[Jj]an\.?","january"
+"[Jj]anuar(ii|y|)","january"
"[Jj]änner","january"
// February
-"[Ff]eb","february"
-"[Ff]eb.","february"
-"[Ff]ebruar","february"
-"[Ff]ebruarii","february"
+"[Ff]eb\.?","february"
+"[Ff]ebruar(ii|y|)","february"
"[Hh]ornung","february"
// March
-"[Mm]är","march"
-"[Mm]är.","march"
+"[Mm]är[z.]?","march"
+"[Mm]arch","march"
"[Mm]aerz","march"
-"[Mm]ärz","march"
"[Mm]art(ii)?","march"
"[Mm]erz","march"
// April
-"[Aa]pr","april"
-"[Aa]pr.","april"
-"[Aa]pril","april"
-"[Aa]prilis","april"
+"[Aa]pr\.?","april"
+"[Aa]pril(is)?","april"
// May
-"[Mm]ai","may"
-"[Mm]ai.","may"
-"[Mm]aii","may"
-"[Mm]ay","may"
+"[Mm]a(ii?|y)\.?","may"
// June
-"[Jj]un","june"
-"[Jj]un.","june"
-"[Jj]uni","june"
-"[Jj]unii","june"
+"[Jj]un\.?","june"
+"[Jj]un(ii?|y)","june"
// July
-"[Jj]ul","july"
-"[Jj]ul.","july"
-"[Jj]uli","july"
-"[Jj]ulii","july"
+"[Jj]ul\.?","july"
+"[Jj]ul(ii?|y)","july"
// August
-"[Aa]ug","august"
-"[Aa]ug.","august"
+"[Aa]ug\.?","august"
"[Aa]ugust","august"
-"[Aa]ugusti","august"
+"[Aa]ugusti?","august"
// September
-"[Ss]ept?","september"
-"[Ss]ept?.","september"
-"Sept","september"
-"sept","september"
+"[Ss]ept?\.?","september"
"[Ss]eptember","september"
-"[Ss]eptembr(is|.)?","september"
+"[Ss]eptembr(is|\.)?","september"
// October
-"[Oo]kt","october"
-"[Oo]kt.","october"
-"[Oo]ktober","october"
-"[Oo]ctober","october"
-"[Oo]ctobr(is|.)","october"
+"[Oo][ck]t\.?","october"
+"[Oo][ck]tober","october"
+"[Oo]ctobr(is|\.)?","october"
// November
-"[Nn]ov","november"
-"[Nn]ov.","november"
+"[Nn]ov\.?","november"
"[Nn]ovember","november"
-"[Nn]ovembr(is|.)","november"
+"[Nn]ovembr(is|\.)?","november"
// December
-"[Dd]ez","december"
-"[Dd]ez.","december"
-"[Dd]ezember","december"
-"[Dd]ecember","december"
-"[Dd]ecembr(is|.)","december"
+"[Dd]e[cz]\.?","december"
+"[Dd]e[cz]ember","december"
+"[Dd]ecembr(is|\.)?","december"
diff --git a/resources/german/normalization/resources_normalization_normPartOfDay.txt b/resources/german/normalization/resources_normalization_normPartOfDay.txt
index d017ddf1..a682142c 100644
--- a/resources/german/normalization/resources_normalization_normPartOfDay.txt
+++ b/resources/german/normalization/resources_normalization_normPartOfDay.txt
@@ -6,33 +6,12 @@
// For example, the normalized value of "vormittags" is "MO"
// FORMAT: "times-of-day-word","normalized-times-of-day-word"
// morning
-"morgens","MO"
-"Morgens","MO"
-"morgen","MO"
-"Morgen","MO"
-"vormittags","MO"
-"Vormittags","MO"
-"vormittag","MO"
-"Vormittag","MO"
+"[Mm]orgen(?:stunden|s|)","MO"
+"[Vv]ormittags?","MO"
// mid-day
-"mittag","12:00"
-"Mittag","12:00"
-"mittags","12:00"
-"Mittags","12:00"
-"nachmittag","AF"
-"Nachmittag","AF"
-"nachmittags","AF"
-"Nachmittags","AF"
-"abend","EV"
-"Abend","EV"
-"abends","EV"
-"Abends","EV"
+"[Mm]ittags?(?:stunden|)","12:00"
+"[Nn]achmittags?","AF"
+"[Aa]bend(?:stunden|s|)","EV"
// night
-"nacht","NI"
-"Nacht","NI"
-"nachts","NI"
-"Nachts","NI"
-"mitternacht","24:00"
-"Mitternacht","24:00"
-"mitternachts","24:00"
-"Mitternachts","24:00"
+"[Nn]acht(?:stunden|s|)","NI"
+"[Mm]itternachts?","24:00"
diff --git a/resources/german/normalization/resources_normalization_normThisNextLast.txt b/resources/german/normalization/resources_normalization_normThisNextLast.txt
index c8bebd50..9ce6f76e 100644
--- a/resources/german/normalization/resources_normalization_normThisNextLast.txt
+++ b/resources/german/normalization/resources_normalization_normThisNextLast.txt
@@ -3,93 +3,13 @@
// date: 2011-06-10
// This file contains "this/next/last words" and their normalized expressions.
// FORMAT: "this/next/last-word","normalized-this/next/last-word"
-"letzte","last"
-"letzter","last"
-"letztes","last"
-"letzten","last"
-"letztem","last"
-"Letzte","last"
-"Letzter","last"
-"Letztes","last"
-"Letzten","last"
-"Letztem","last"
-"nächste","next"
-"nächster","next"
-"nächstes","next"
-"nächsten","next"
-"nächstem","next"
-"Nächste","next"
-"Nächster","next"
-"Nächstes","next"
-"Nächsten","next"
-"Nächstem","next"
-"folgende","next"
-"folgender","next"
-"folgendes","next"
-"folgenden","next"
-"folgendem","next"
-"Folgende","next"
-"Folgender","next"
-"Folgendes","next"
-"Folgenden","next"
-"Folgendem","next"
-"vorige","last"
-"voriger","last"
-"voriges","last"
-"vorigen","last"
-"vorigem","last"
-"Vorige","last"
-"Voriger","last"
-"Voriges","last"
-"Vorigen","last"
-"Vorigem","last"
-"diese","this"
-"dieser","this"
-"dieses","this"
-"diesen","this"
-"diesem","this"
-"Diese","this"
-"Dieser","this"
-"Dieses","this"
-"Diesen","this"
-"Diesem","this"
-"Selbe","this"
-"Selber","this"
-"Selbes","this"
-"Selben","this"
-"Selbem","this"
-"selbe","this"
-"selber","this"
-"selbes","this"
-"selben","this"
-"selbem","this"
-"Gleiche","this"
-"Gleicher","this"
-"Gleiches","this"
-"Gleichen","this"
-"Gleichem","this"
-"gleiche","this"
-"gleicher","this"
-"gleiches","this"
-"gleichen","this"
-"gleichem","this"
-"Kommende","next"
-"Kommender","next"
-"Kommendes","next"
-"Kommenden","next"
-"Kommendem","next"
-"kommende","next"
-"kommender","next"
-"kommendes","next"
-"kommenden","next"
-"kommendem","next"
-"Vergangene","last"
-"Vergangener","last"
-"Vergangenes","last"
-"Vergangenen","last"
-"Vergangenem","last"
-"vergangene","last"
-"vergangener","last"
-"vergangenes","last"
-"vergangenen","last"
-"vergangenem","last"
+"[Ll]etzte[rsnm]?","last"
+"[Nn]ächste[rsnm]?","next"
+"[Nn]eue[rsnm]?","next"
+"[Ff]olgende[rsnm]?","next"
+"[Vv]orige[rsnm]?","last"
+"[Dd]iese[rsnm]?","this"
+"[Ss]elbe[rsnm]?","this"
+"[Gg]leiche[rsnm]?","this"
+"[Kk]ommende[rsnm]?","next"
+"[Vv]ergangene[rsnm]?","last"
diff --git a/resources/german/normalization/resources_normalization_normUnit.txt b/resources/german/normalization/resources_normalization_normUnit.txt
index 82a88e9c..1ed165da 100644
--- a/resources/german/normalization/resources_normalization_normUnit.txt
+++ b/resources/german/normalization/resources_normalization_normUnit.txt
@@ -5,60 +5,17 @@
// according to TIMEX3 format.
// For example, the normalized value of "Woche" is "week"
// FORMAT: "unit-word","normalized-unit-word"
-"Handelstag","day"
-"Handelstage","day"
-"Handelstagen","day"
-"Tag","day"
-"Tage","day"
-"Tagen","day"
-"Wochenende","week-WE"
-"Wochenenden","week-WE"
-"Woche","week"
-"Wochen","week"
-"Monat","month"
-"Monate","month"
-"Monaten","month"
-"Quartal","quarter"
-"Quartale","quarter"
-"Quartalen","quarter"
-"Jahr","year"
-"Jahre","year"
-"Jahren","year"
-"Jahrzehnt","decade"
-"Jahrzehnte","decade"
-"Jahrzehnten","decade"
-"Jahrhundert","century"
-"Jahrhunderte","century"
-"Jahrhunderten","century"
-"Jh\.","century"
-// LOWERCASE
-"handelstag","day"
-"handelstage","day"
-"handelstagen","day"
-"tag","day"
-"tage","day"
-"tagen","day"
-"wochenende","week-WE"
-"wochenenden","week-WE"
-"woche","week"
-"wochen","week"
-"monat","month"
-"monate","month"
-"monaten","month"
-"quartal","quarter"
-"quartale","quarter"
-"quartalen","quarter"
-"jahr","year"
-"jahre","year"
-"jahren","year"
-"jahrzehnt","decade"
-"jahrzehnte","decade"
-"jahrzehnten","decade"
-"jahrhundert","century"
-"jahrhunderte","century"
-"jahrhunderten","century"
-// not in reUnit
-"Stunde","hour"
-"Stunden","hour"
-"Minute","minute"
-"Minuten","minute"
+"[Hh]andelstage?n?","day"
+"[Tt]age?n?","day"
+"[Ww]ochenenden?","week-WE"
+"[Ww]ochen?","week"
+"[Mm]onate?n?","month"
+"[Qq]uartale?n?","quarter"
+"[Jj]ahre?n?","year"
+"[Jj]ahrzehnte?n?","decade"
+"[Jj]ahrhunderte?n?","century"
+"[Jj]hd?t?\.","century"
+// not in reUnit, but reUnitFine
+"[Ss]tunden?","hour"
+"[Mm]inuten?","minute"
+"[Ss]ekunden?","second"
diff --git a/resources/german/normalization/resources_normalization_normUnit4Duration.txt b/resources/german/normalization/resources_normalization_normUnit4Duration.txt
index 2c87d67f..56c1f11d 100644
--- a/resources/german/normalization/resources_normalization_normUnit4Duration.txt
+++ b/resources/german/normalization/resources_normalization_normUnit4Duration.txt
@@ -5,60 +5,17 @@
// according to TIMEX3 format.
// For example, the normalized value of "Woche" is "WE"
// FORMAT: "unit-word","normalized-unit-word"
-"Handelstag","D"
-"Handelstage","D"
-"Handelstagen","D"
-"Tag","D"
-"Tage","D"
-"Tagen","D"
-"Wochenende","WE"
-"Wochenenden","WE"
-"Woche","W"
-"Wochen","W"
-"Monat","M"
-"Monate","M"
-"Monaten","M"
-"Quartal","Q"
-"Quartale","Q"
-"Quartalen","Q"
-"Jahr","Y"
-"Jahre","Y"
-"Jahren","Y"
-"Jahrzehnt","DE"
-"Jahrzehnte","DE"
-"Jahrzehnten","DE"
-"Jahrhundert","CE"
-"Jahrhunderte","CE"
-"Jahrhunderten","CE"
-"Jh\.","CE"
-// LOWERCASE
-"handelstag","D"
-"handelstage","D"
-"handelstagen","D"
-"tag","D"
-"tage","D"
-"tagen","D"
-"wochenende","WE"
-"wochenenden","WE"
-"woche","W"
-"wochen","W"
-"monat","M"
-"monate","M"
-"monaten","M"
-"quartal","Q"
-"quartale","Q"
-"quartalen","Q"
-"jahr","Y"
-"jahre","Y"
-"jahren","Y"
-"jahrzehnt","DE"
-"jahrzehnte","DE"
-"jahrzehnten","DE"
-"jahrhundert","CE"
-"jahrhunderte","CE"
-"jahrhunderten","CE"
+"[Hh]andelstage?n?","D"
+"[Tt]age?n?","D"
+"[Ww]ochenenden?","WE"
+"[Ww]ochen?","W"
+"[Mm]onate?n?","M"
+"[Qq]uartale?n?","Q"
+"[Jj]ahre?n?","Y"
+"[Jj]ahrzehnte?n?","DE"
+"[Jj]ahrhunderte?n?","CE"
+"[Jj]hd?t?\.","CE"
// not in reUnit
-"Stunde","H"
-"Stunden","H"
-"Minute","M"
-"Minuten","M"
+"[Ss]tunden?","H"
+"[Mm]inuten?","M"
+"[Ss]ekunden?","S"
diff --git a/resources/german/repattern/resources_repattern_reAndOrTo.txt b/resources/german/repattern/resources_repattern_reAndOrTo.txt
index 4e1c8fb8..9b764911 100644
--- a/resources/german/repattern/resources_repattern_reAndOrTo.txt
+++ b/resources/german/repattern/resources_repattern_reAndOrTo.txt
@@ -3,10 +3,5 @@
// date: 2013-10-17
// This file contains regular expression patterns for "and", "or", "to" words.
// FORMAT: one line is one disjunction of the pattern
-[\s]?\–[\s]?
-[\s]?-[\s]?
-[\s]?–[\s]?
-[\s]?/[\s]?
- und (zum )?
- bis (zum )?
- oder (zum)?
+\s?[/–‒‑-]\s?
+ (?:und|bis|oder) (?:zum )?
diff --git a/resources/german/repattern/resources_repattern_reApproximate.txt b/resources/german/repattern/resources_repattern_reApproximate.txt
index 628a90a5..eb881b3e 100644
--- a/resources/german/repattern/resources_repattern_reApproximate.txt
+++ b/resources/german/repattern/resources_repattern_reApproximate.txt
@@ -7,6 +7,9 @@
[Ee]twa
[Uu]ngefähr
[Cc]irca
+[Gg]egen
+[Ff]rüh(?:en)?
+[Ss]pät(?:en)?
// less
[Nn]icht mehr als
[Nn]icht länger als
diff --git a/resources/german/repattern/resources_repattern_reDayNumberTh.txt b/resources/german/repattern/resources_repattern_reDayNumberTh.txt
index d86a9cbb..1b2f4721 100644
--- a/resources/german/repattern/resources_repattern_reDayNumberTh.txt
+++ b/resources/german/repattern/resources_repattern_reDayNumberTh.txt
@@ -3,4 +3,8 @@
// date: 2011-06-10
// This file contains regular expression patterns for day digit th.
// FORMAT: one line is one disjunction of the pattern
-DUMMY-FOR-NUMBERTH
\ No newline at end of file
+0[0-9]\.
+[1-9]\.
+1[0-9]\.
+2[0-9]\.
+3[01]\.
diff --git a/resources/german/repattern/resources_repattern_reHolidayFix.txt b/resources/german/repattern/resources_repattern_reHolidayFix.txt
index 4796cf48..4aa30026 100644
--- a/resources/german/repattern/resources_repattern_reHolidayFix.txt
+++ b/resources/german/repattern/resources_repattern_reHolidayFix.txt
@@ -10,7 +10,7 @@ Neujahrstag
Neujahrsfest
// http://de.wikipedia.org/wiki/Heilige_Drei_K%C3%B6nige
-[Hh]eilig(en?)? [Dd]rei König(en?)?
+[Hh]eilig(?:en?|) [Dd]rei König(?:en?|)
Epiphanias
Epiphanie
Erscheinung des Herrn
@@ -49,7 +49,7 @@ Allerheiligen
Weihnachten
Weihnachtsabend
Heiligabend
-[Hh]eilig(en)? Abend
+[Hh]eilig(?:en|) Abend
Weihnacht
[Hh]eiliges Christfest
Christfest
diff --git a/resources/german/repattern/resources_repattern_reMonthLong.txt b/resources/german/repattern/resources_repattern_reMonthLong.txt
index b62ee369..1478c099 100644
--- a/resources/german/repattern/resources_repattern_reMonthLong.txt
+++ b/resources/german/repattern/resources_repattern_reMonthLong.txt
@@ -3,27 +3,26 @@
// date: 2011-06-10
// This file contains regular expression patterns for long months.
// FORMAT: one line is one disjunction of the pattern
-[Jj]anuar(ii)?
+[Jj]anuar(?:ii|y|)
[Jj]änner
-[Ff]ebruar(ii)?
+[Ff]ebruar(?:ii|y|)
[Hh]ornung
[Mm]ärz
[Mm]aerz
+[Mm]arch
[Mm]erz
-[Mm]art(ii)?
-[Aa]pril(is)?
-[Mm]ai(i)?
+[Mm]art(?:ii)?
+[Aa]pril(?:is)?
+[Mm]aii?
[Mm]ay
-[Jj]uni(i)?
-[Jj]uli(i)?
-[Aa]ugust(i)?
+[Jj]un(?:ii?|y)
+[Jj]ul(?:ii?|y)
+[Aa]ugusti?
[Ss]eptember
-[Ss]eptembr(is|.)?
-[Oo]ktober
-[Oo]ctober
-[Oo]ctobr(is|.)
+[Ss]eptembr(?:is|\.)?
+[Oo][ck]tober
+[Oo]ctobr(?:is|\.)?
[Nn]ovember
-[Nn]ovembr(is|.)
-[Dd]ezember
-[Dd]ecember
-[Dd]ecembr(is|.)
+[Nn]ovembr(?:is|\.)?
+[Dd]e[cz]ember
+[Dd]ecembr(?:is|\.)?
diff --git a/resources/german/repattern/resources_repattern_reMonthShort.txt b/resources/german/repattern/resources_repattern_reMonthShort.txt
index f0000eac..399809af 100644
--- a/resources/german/repattern/resources_repattern_reMonthShort.txt
+++ b/resources/german/repattern/resources_repattern_reMonthShort.txt
@@ -7,11 +7,11 @@
[Jj]an
[Ff]eb\.
[Ff]eb
-[Mm]är\.
-[Mm]är
+[Mm][aä]r\.
+[Mm][aä]r
[Aa]pr\.
[Aa]pr
-[Mm]ai
+[Mm]a[iy]
[Jj]un\.
[Jj]un
[Jj]ul\.
@@ -22,9 +22,9 @@
[Ss]ep
[Ss]ept\.
[Ss]ept
-[Oo]kt\.
-[Oo]kt
+[Oo][ck]t\.
+[Oo][ck]t
[Nn]ov\.
[Nn]ov
-[Dd]ez\.
-[Dd]ez
\ No newline at end of file
+[Dd]e[cz]\.
+[Dd]e[cz]
diff --git a/resources/german/repattern/resources_repattern_rePartOfDay.txt b/resources/german/repattern/resources_repattern_rePartOfDay.txt
index 8d485d0e..5722da23 100644
--- a/resources/german/repattern/resources_repattern_rePartOfDay.txt
+++ b/resources/german/repattern/resources_repattern_rePartOfDay.txt
@@ -5,8 +5,8 @@
// FORMAT: one line is one disjunction of the pattern
[Vv]ormittag
[Nn]achmittag
-[Mm]ittag
+[Mm]ittag(?:sstunden|)
[Mm]itternacht
-[Nn]acht
-[Mm]orgen
-[Aa]bend
\ No newline at end of file
+[Nn]acht(?:stunden|)
+[Mm]orgen(?:stunden|)
+[Aa]bend(?:stunden|)
diff --git a/resources/german/repattern/resources_repattern_rePartOfYear.txt b/resources/german/repattern/resources_repattern_rePartOfYear.txt
index 2ccdd1da..77501b46 100644
--- a/resources/german/repattern/resources_repattern_rePartOfYear.txt
+++ b/resources/german/repattern/resources_repattern_rePartOfYear.txt
@@ -9,15 +9,12 @@
[Dd]ritte[ns]? Quartals?
[Vv]ierte[ns]? Quartals?
[Ll]etzte[ns]? Quartals?
-1\. Quartals?
-2\. Quartals?
-3\. Quartals?
-4\. Quartals?
+[1-4]\. Quartals?
// Jahreshälften
[Ee]rste[ns]? Hälfte
[Zz]weite[ns]? Hälfte
[Ll]etzte[ns]? Hälfte
-[Ee]rste[ns]? Halbjahr(es)?
-[Zz]weite[ns]? Halbjahr(es)?
+[Ee]rste[ns]? Halbjahr(?:es|)
+[Zz]weite[ns]? Halbjahr(?:es|)
[Ee]rste[ns]? Jahreshälfte
[Zz]weite[ns]? Jahreshälfte
\ No newline at end of file
diff --git a/resources/german/repattern/resources_repattern_reThisNextLast.txt b/resources/german/repattern/resources_repattern_reThisNextLast.txt
index 03402285..3e8f1019 100644
--- a/resources/german/repattern/resources_repattern_reThisNextLast.txt
+++ b/resources/german/repattern/resources_repattern_reThisNextLast.txt
@@ -5,6 +5,7 @@
// FORMAT: one line is one disjunction of the pattern
[Ll]etzte[rsnm]?
[Nn]ächste[rsnm]?
+[Nn]eue[rsnm]?
[Ff]olgende[rsnm]?
[Dd]iese[rsnm]?
[Vv]orige[rsnm]?
diff --git a/resources/german/repattern/resources_repattern_reTimeHour.txt b/resources/german/repattern/resources_repattern_reTimeHour.txt
index 9e5d33a6..ae842ddd 100644
--- a/resources/german/repattern/resources_repattern_reTimeHour.txt
+++ b/resources/german/repattern/resources_repattern_reTimeHour.txt
@@ -3,28 +3,6 @@
// date: 2011-06-10
// This file contains regular expression patterns for time hours.
// FORMAT: one line is one disjunction of the pattern
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20
-21
-22
-23
-24
-1
-2
-3
-4
-5
-6
-7
-8
-9
-0
\ No newline at end of file
+[01][0-9]?
+2[0-4]?
+[3-9]
diff --git a/resources/german/repattern/resources_repattern_reTimeMinute.txt b/resources/german/repattern/resources_repattern_reTimeMinute.txt
index 6e560f7a..15189854 100644
--- a/resources/german/repattern/resources_repattern_reTimeMinute.txt
+++ b/resources/german/repattern/resources_repattern_reTimeMinute.txt
@@ -3,4 +3,4 @@
// date: 2011-06-10
// This file contains regular expression patterns for time minutes.
// FORMAT: one line is one disjunction of the pattern
-[0|1|2|3|4|5][0-9]
\ No newline at end of file
+[0-5][0-9]
diff --git a/resources/german/repattern/resources_repattern_reUnit.txt b/resources/german/repattern/resources_repattern_reUnit.txt
index e4444820..238d911a 100644
--- a/resources/german/repattern/resources_repattern_reUnit.txt
+++ b/resources/german/repattern/resources_repattern_reUnit.txt
@@ -3,29 +3,15 @@
// date: 2011-06-10
// This file contains regular expression patterns for unit words.
// FORMAT: one line is one disjunction of the pattern
-[Hh]andelstagen
-[Hh]andelstage
-[Hh]andelstag
-[Tt]agen
-[Tt]age
-[Tt]ag
-[Ww]ochenenden
-[Ww]ochenende
-[Ww]ochen
-[Ww]oche
-[Mm]onaten
-[Mm]onate
-[Mm]onat
-[Qq]uartalen
-[Qq]uartale
-[Qq]uartal
-[Jj]ahren
-[Jj]ahre
-[Jj]ahr
-[Jj]ahrzehnten
-[Jj]ahrzehnte
-[Jj]ahrzehnt
-[Jj]ahrhunderten
-[Jj]ahrhunderte
-[Jj]ahrhundert
-Jh\.
\ No newline at end of file
+[Hh]andelstag(?:en|e|)
+[Tt]ag(?:en|e|)
+[Ww]ochenenden?
+[Ww]ochen?
+[Mm]onat(?:en|e|)
+[Qq]uartal(?:en|e|)
+[Jj]ahr(?:en|e|)
+[Jj]ahrzehnt(?:en|e|)
+[Jj]ahrhundert(?:en|e|)
+Jh\.
+Jhd\.
+Jhdt\.
diff --git a/resources/german/repattern/resources_repattern_reUnitFine.txt b/resources/german/repattern/resources_repattern_reUnitFine.txt
new file mode 100644
index 00000000..b1376915
--- /dev/null
+++ b/resources/german/repattern/resources_repattern_reUnitFine.txt
@@ -0,0 +1,20 @@
+// author: Jannik Strötgen
+// email: stroetgen@uni-hd.de
+// date: 2011-06-10
+// This file contains regular expression patterns for unit words.
+// FORMAT: one line is one disjunction of the pattern
+[Ss]tunden?
+[Mm]inuten?
+[Ss]ekunden?
+[Hh]andelstag(?:en|e|)
+[Tt]ag(?:en|e|)
+[Ww]ochenenden?
+[Ww]ochen?
+[Mm]onat(?:en|e|)
+[Qq]uartal(?:en|e|)
+[Jj]ahr(?:en|e|)
+[Jj]ahrzehnt(?:en|e|)
+[Jj]ahrhundert(?:en|e|)
+Jh\.
+Jhd\.
+Jhdt\.
diff --git a/resources/german/repattern/resources_repattern_reYearPrefix.txt b/resources/german/repattern/resources_repattern_reYearPrefix.txt
index 6afacbb3..26737e19 100644
--- a/resources/german/repattern/resources_repattern_reYearPrefix.txt
+++ b/resources/german/repattern/resources_repattern_reYearPrefix.txt
@@ -4,14 +4,14 @@
// This file contains regular expression patterns for year numbers (4 digits).
// FORMAT: one line is one disjunction of the pattern
BC
-B[\.]C[\.]
-B[\.]C
+B\.C\.
+B\.C
AD
-A[\.]D[\.]
-A[\.]D
-v[\.] Chr[\.]
-n[\.] Chr[\.]
-vor Chr[\.]
-nach Chr[\.]
+A\.D\.
+A\.D
+v\. Chr\.
+n\. Chr\.
+vor Chr\.
+nach Chr\.
vor Christus
nach Christus
\ No newline at end of file
diff --git a/resources/german/rules/resources_rules_daterules.txt b/resources/german/rules/resources_rules_daterules.txt
index 17c859df..d2c9a6aa 100644
--- a/resources/german/rules/resources_rules_daterules.txt
+++ b/resources/german/rules/resources_rules_daterules.txt
@@ -15,12 +15,12 @@
// EXAMPLE date_historic_1d-BCADhint: Anfang 190 v. Chr. (1- to 4-digit year)
// EXAMPLE date_historic_1e-BCADhint: Anfang v. Chr. 190 v. Chr. (1- to 4-digit year)
// EXAMPLE date_historic_1f-BCADhint: Anfang 190 bis 180 v. Chr. (find "Anfang 190 v. Chr."; 1- to 4-digit year)
-RULENAME="date_historic_1a-BCADhint",EXTRACTION="(%reApproximate )?(Jahr(e)?([ns])? )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))"
-RULENAME="date_historic_1b-BCADhint",EXTRACTION="(%reApproximate )?(Jahr(e)?([ns])? )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(6))%normYearBC(group(7))"
-RULENAME="date_historic_1c-BCADhint",EXTRACTION="(%reApproximate )?(Jahr(e)?([ns])? )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(9))%normYearBC(group(6))",OFFSET="group(0)-group(6)"
-RULENAME="date_historic_1d-BCADhint",EXTRACTION="(%rePartWords )(Jahr(e)?([ns])? )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))"
-RULENAME="date_historic_1e-BCADhint",EXTRACTION="(%rePartWords )(Jahr(e)?([ns])? )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(6))%normYearBC(group(7))"
-RULENAME="date_historic_1f-BCADhint",EXTRACTION="(%rePartWords )(Jahr(e)?([ns])? )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(9))%normYearBC(group(6))",OFFSET="group(0)-group(6)"
+RULENAME="date_historic_1a-BCADhint",EXTRACTION="(?:%reApproximate |)(?:Jahre?[ns]? )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%normYearBC(group(2))"
+RULENAME="date_historic_1b-BCADhint",EXTRACTION="(?:%reApproximate |)(?:Jahre?[ns]? )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(2))%normYearBC(group(3))"
+RULENAME="date_historic_1c-BCADhint",EXTRACTION="(?:%reApproximate |)(?:Jahre?[ns]? )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(2))",OFFSET="group(0)-group(2)"
+RULENAME="date_historic_1d-BCADhint",EXTRACTION="(?:%rePartWords )(?:Jahre?[ns]? )?%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(3))%normYearBC(group(2))"
+RULENAME="date_historic_1e-BCADhint",EXTRACTION="(?:%rePartWords )(?:Jahre?[ns]? )?%reYearPrefix %reYearBC",NORM_VALUE="%normYearPrefix(group(2))%normYearBC(group(3))"
+RULENAME="date_historic_1f-BCADhint",EXTRACTION="(?:%rePartWords )(?:Jahre?[ns]? )?%reYearBC%reAndOrTo%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(2))",OFFSET="group(0)-group(2)"
// historic dates; month granularity
// EXAMPLE date_historic_2a-BCADhint: Januar 190 v. Chr. (1- to 4-digit year)
@@ -29,44 +29,44 @@ RULENAME="date_historic_1f-BCADhint",EXTRACTION="(%rePartWords )(Jahr(e)?([ns])?
// EXAMPLE date_historic_2d-BCADhint: Anfang Januar 190 v. Chr. (1- to 4-digit year)
// EXAMPLE date_historic_2e: Anfang Januar 190 (3-digit year)
// EXAMPLE date_historic_2f: Anfang Januar 90 (2-digit year)
-RULENAME="date_historic_2a-BCADhint",EXTRACTION="(%reApproximate )?(%reMonthLong|%reMonthShort)( )%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(8))%normYearBC(group(7))-%normMonth(group(3))"
-RULENAME="date_historic_2b",EXTRACTION="(%reApproximate )?(%reMonthLong|%reMonthShort)( )([\d][\d][\d])",NORM_VALUE="%normYearBC(group(7))-%normMonth(group(3))"
-RULENAME="date_historic_2c",EXTRACTION="(%reApproximate )?(%reMonthLong|%reMonthShort)( )([\d][\d])",NORM_VALUE="UNDEF-centurygroup(7)-%normMonth(group(3))"
-RULENAME="date_historic_2d-BCADhint",EXTRACTION="(%rePartWords )(%reMonthLong|%reMonthShort)( )%reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(8))%normYearBC(group(7))-%normMonth(group(3))"
-RULENAME="date_historic_2e",EXTRACTION="(%rePartWords )(%reMonthLong|%reMonthShort)( )([\d][\d][\d])",NORM_VALUE="%normYearBC(group(7))-%normMonth(group(3))"
-RULENAME="date_historic_2f",EXTRACTION="(%rePartWords )(%reMonthLong|%reMonthShort)( )([\d][\d])",NORM_VALUE="UNDEF-centurygroup(7)-%normMonth(group(3))"
+RULENAME="date_historic_2a-BCADhint",EXTRACTION="(?:%reApproximate |)%(reMonthLong|reMonthShort) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))"
+RULENAME="date_historic_2b",EXTRACTION="(?:%reApproximate |)%(reMonthLong|reMonthShort) (\d\d\d)",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(2))"
+RULENAME="date_historic_2c",EXTRACTION="(?:%reApproximate |)%(reMonthLong|reMonthShort) (\d\d)",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))"
+RULENAME="date_historic_2d-BCADhint",EXTRACTION="(?:%rePartWords )%(reMonthLong|reMonthShort) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))"
+RULENAME="date_historic_2e",EXTRACTION="(?:%rePartWords )%(reMonthLong|reMonthShort) (\d\d\d)",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(2))"
+RULENAME="date_historic_2f",EXTRACTION="(?:%rePartWords )%(reMonthLong|reMonthShort) (\d\d)",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))"
// historic dates; day granularity
// EXAMPLE date_historic_3a-BCADhint: 1. Januar 190 v. Chr. (1- to 4-digit year)
// EXAMPLE date_historic_3b: 1. Januar 190 (3-digit year)
// EXAMPLE date_historic_3c: 1. Januar 90 (2-digit year)
// EXAMPLE date_historic_3d: 1. - 15. Januar 90 (find "1. Januar 90"; 2-digit year)
-RULENAME="date_historic_3a-BCADhint",EXTRACTION="%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(7))%normYearBC(group(6))-%normMonth(group(2))-%normDay(group(1))"
-RULENAME="date_historic_3b",EXTRACTION="%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? (\d\d\d)",NORM_VALUE="%normYearBC(group(6))-%normMonth(group(2))-%normDay(group(1))"
-RULENAME="date_historic_3c",EXTRACTION="%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(6)-%normMonth(group(2))-%normDay(group(1))"
-RULENAME="date_historic_3d",EXTRACTION="(%reDayNumber[\.]?)%reAndOrTo%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(9)-%normMonth(group(5))-%normDay(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="date_historic_3a-BCADhint",EXTRACTION="%reDayNumber\.? %(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3b",EXTRACTION="%reDayNumber\.? %(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) (\d\d\d)",NORM_VALUE="%normYearBC(group(3))-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3c",EXTRACTION="%reDayNumber\.? %(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_historic_3d",EXTRACTION="(%reDayNumber\.?)%reAndOrTo%reDayNumber\.? %(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(6)-%normMonth(group(5))-%normDay(group(1))",OFFSET="group(1)-group(1)"
// historic dates; season granularity
// EXAMPLE date_historic_4a-BCADhint: Winter 190 v. Chr. (1- to 4-digit year)
// EXAMPLE date_historic_4b-BCADhint: Mitte Winter 190 v.Chr. (1- to 4-digit year)
-RULENAME="date_historic_4a-BCADhint",EXTRACTION="(%reApproximate )?%reSeason %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(4))-%normSeason(group(3))",
-RULENAME="date_historic_4b-BCADhint",EXTRACTION="(%rePartWords )%reSeason %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(5))%normYearBC(group(4))-%normSeason(group(3))",
+RULENAME="date_historic_4a-BCADhint",EXTRACTION="(?:%reApproximate |)%reSeason %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normSeason(group(2))",
+RULENAME="date_historic_4b-BCADhint",EXTRACTION="(?:%rePartWords )%reSeason %reYearBC %reYearPrefix",NORM_VALUE="%normYearPrefix(group(4))%normYearBC(group(3))-%normSeason(group(2))",
// historic dates; century granularity
// EXAMPLE date_historic_5a-BCADhint: das 5. Jahrhundert v. Chr.
-RULENAME="date_historic_5a-BCADhint",EXTRACTION="([Dd]as )?(%reDayNumber[\.]?|%reDayWordTh) (Jahrhundert[s]?|Jh\.) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(6))%normDay(%SUM%(%normDay(group(2)),-1))"
+RULENAME="date_historic_5a-BCADhint",EXTRACTION="(?:[Dd]as )?%(reDayNumber|reDayNumberTh|reDayWordTh) J(?:ahrhunderts?|h\.) %reYearPrefix",NORM_VALUE="%normYearPrefix(group(2))%normDay(%SUM%(%normDay(group(1)),-1))"
// historic dates; year granularity
// EXAMPLE date_historic_6a: Jahr 90 (2-digit year)
// EXAMPLE date_historic_6b: Jahr 190 (3-digit year)
-RULENAME="date_historic_6a",EXTRACTION="(Jahr(e)?([ns])?) (\d\d)",NORM_VALUE="%normYearBC(group(4))"
-RULENAME="date_historic_6b",EXTRACTION="(Jahr(e)?([ns])?) (\d\d\d)",NORM_VALUE="%normYearBC(group(4))"
+RULENAME="date_historic_6a",EXTRACTION="Jahre?[ns]? (\d\d)",NORM_VALUE="UNDEF-centurygroup(1)"
+RULENAME="date_historic_6b",EXTRACTION="Jahre?[ns]? (\d\d\d)",NORM_VALUE="%normYearBC(group(1))"
// historic dates; negative rules
// EXAMPLE date_historic_0a_negative: MiG-190 (1- to 4-digit year)
// EXAMPLE date_historic_0b_negative: 90 Menschen (2-digit year)
-RULENAME="date_historic_0a_negative",EXTRACTION="(MiG(-)%reYearBC)",NORM_VALUE="REMOVE"
-RULENAME="date_historic_0b_negative",EXTRACTION="(%reYear2Digit )([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(3):NN:",OFFSET="group(1)-group(1)"
+RULENAME="date_historic_0a_negative",EXTRACTION="MiG-%reYearBC",NORM_VALUE="REMOVE"
+RULENAME="date_historic_0b_negative",EXTRACTION="%reYear2Digit (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NN:",OFFSET="group(1)-group(1)"
////////////////////
// POSITIVE RULES //
@@ -81,53 +81,53 @@ RULENAME="date_historic_0b_negative",EXTRACTION="(%reYear2Digit )([\S]+)",NORM_V
// EXAMPLE r0c_1: 09/26/1999
// EXAMPLE r0d_1: 09/26/99
// EXAMPLE r0e_1: 7-14 (AP) (find 7-14)
-RULENAME="date_r0a",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)",NORM_VALUE="group(1)"
-RULENAME="date_r0b",EXTRACTION="(%reMonthNumber-%reDayNumber-%reYear2Digit)",NORM_VALUE="UNDEF-centurygroup(4)-group(2)-group(3)"
+RULENAME="date_r0a",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber",NORM_VALUE="group(0)"
+RULENAME="date_r0b",EXTRACTION="%reMonthNumber-%reDayNumber-%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-group(1)-group(2)"
RULENAME="date_r0c",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))"
RULENAME="date_r0d",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(1))-%normDay(group(2))"
-RULENAME="date_r0e",EXTRACTION="%reMonthNumber-%reDayNumber( \(.*?\))",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(2))",OFFSET="group(1)-group(2)"
+RULENAME="date_r0e",EXTRACTION="%reMonthNumber-%reDayNumber \(.*?\)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(2))",OFFSET="group(1)-group(2)"
// date_r1
// EXAMPLE r1a_1: 1.3.99
// EXAMPLE r1b_1: 1.3.1999
-RULENAME="date_r1a",EXTRACTION="%reDayNumber[\.]%reMonthNumber[\.]%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))-%normDay(group(1))"
-RULENAME="date_r1b",EXTRACTION="%reDayNumber[\.]%reMonthNumber[\.]%reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_r1a",EXTRACTION="%reDayNumber\.%reMonthNumber\.%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_r1b",EXTRACTION="%reDayNumber\.%reMonthNumber\.%reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))"
// date_r2 (keep though it is English date format)
// EXAMPLE r2a_1: Februar 25, 2009
// EXAMPLE r2a_2: Feb. 25, 2009
-RULENAME="date_r2a",EXTRACTION="(%reMonthLong|%reMonthShort) %reDayNumber[\s]?, %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(1))-%normDay(group(4))"
+RULENAME="date_r2a",EXTRACTION="%(reMonthLong|reMonthShort) %reDayNumber\s?, %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(1))-%normDay(group(2))"
// date_r3
// EXAMPLE r3a_1: 25. Februar 2009
// EXAMPLE r3a_2: 25 Feb 2009
// EXAMPLE r3a_3: 25 Feb. 2009
// EXAMPLE r3a_4: 25. November des Jahres 2001
-RULENAME="date_r3a",EXTRACTION="%reDayNumber[\.]? (%reMonthLong|%reMonthShort)( des Jahres| im Jahre)? %reYear4Digit",NORM_VALUE="group(6)-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_r3a",EXTRACTION="%reDayNumber\.? %(reMonthLong|reMonthShort)(?: des Jahres| im Jahre|) %reYear4Digit",NORM_VALUE="group(3)-%normMonth(group(2))-%normDay(group(1))"
// date_r4
// EXAMPLE r4a_1: November 19
// EXAMPLE r4b_1: 19. November
// EXAMPLE r4c_1: November 15 - 18 (find November 18)
// EXAMPLE r4d_1: 19. und 20. Januar (find 19. Januar)
-RULENAME="date_r4a",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(4))"
-RULENAME="date_r4b",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber[\.]?) (%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(5))-%normDay(group(1))"
-RULENAME="date_r4c",EXTRACTION="(%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)(\–| - | – |-|–)(%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(9))",OFFSET="group(9)-group(9)"
-RULENAME="date_r4d",EXTRACTION="(%reDayWordTh|%reDayNumberTh|%reDayNumber[\.]?)%reAndOrTo(%reDayWordTh|%reDayNumberTh|%reDayNumber[.]?) (%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(10))-%normDay(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="date_r4a",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(2))"
+RULENAME="date_r4b",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(1))"
+RULENAME="date_r4c",EXTRACTION="%(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)\s*%reAndOrTo\s*%(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(1))-%normDay(group(4))",OFFSET="group(4)-group(4)"
+RULENAME="date_r4d",EXTRACTION="%(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(4))-%normDay(group(1))",OFFSET="group(1)-group(1)"
// date_r5
// EXAMPLE r5a_1: Freitag Oktober 13
// EXAMPLE r5b_1: Freitag 13. Oktober
-RULENAME="date_r5a",EXTRACTION="%reWeekday[,]? (%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(5))"
-RULENAME="date_r5b",EXTRACTION="%reWeekday[,]? (%reDayWordTh|%reDayNumberTh|%reDayNumber)[\.]? (%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(6))-%normDay(group(2))"
+RULENAME="date_r5a",EXTRACTION="%reWeekday,? %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh)",NORM_VALUE="UNDEF-year-%normMonth(group(2))-%normDay(group(3))"
+RULENAME="date_r5b",EXTRACTION="%reWeekday,? %(reDayNumber|reDayNumberTh|reDayWordTh)\.? %(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))-%normDay(group(2))"
// date_r6
// EXAMPLE r6a_1: 14. und 15. September 2010 (find: 14. September 2010)
-RULENAME="date_r6a",EXTRACTION="(%reDayNumberTh|%reDayNumber)[\.]?%reAndOrTo(%reDayNumberTh|%reDayNumber)[\.]? (%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(11)-%normMonth(group(8))-%normDay(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="date_r6a",EXTRACTION="%(reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(4))-%normDay(group(1))",OFFSET="group(1)-group(1)"
// date_r7
// EXAMPLE r7a_1: Friday Oktober 13 2009
-RULENAME="date_r7a",EXTRACTION="%reWeekday[,]? (%reMonthLong|%reMonthShort) (%reDayWordTh|%reDayNumberTh|%reDayNumber)[,]? %reYear4Digit",NORM_VALUE="group(9)-%normMonth(group(2))-%normDay(group(5))"
+RULENAME="date_r7a",EXTRACTION="%reWeekday,? %(reMonthLong|reMonthShort) %(reDayNumber|reDayNumberTh|reDayWordTh),? %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(2))-%normDay(group(3))"
// date_r8
// EXAMPLE 8a_1: tomorrow
@@ -144,8 +144,8 @@ RULENAME="date_r9a",EXTRACTION="%reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(g
// EXAMPLE r10a_1: November 2001
// EXAMPLE r10a_2: Nov. 2001
// EXAMPLE r10b_1: Mai and Juni 2011 (find Mai 2001)
-RULENAME="date_r10a",EXTRACTION="(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(1))"
-RULENAME="date_r10b",EXTRACTION="(%reMonthLong|%reMonthShort)%reAndOrTo(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(8)-%normMonth(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="date_r10a",EXTRACTION="(?:%(reMonthLong|reMonthShort)%reAndOrTo)?%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(3))",OFFSET="group(3)-group(4)"
+RULENAME="date_r10b",EXTRACTION="%(reMonthLong|reMonthShort)%reAndOrTo%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(1))",OFFSET="group(1)-group(1)"
// date_r11
// EXAMPLE r11a_1: November diesen Jahres
@@ -159,8 +159,8 @@ RULENAME="date_r11a",EXTRACTION="%reMonthLong %reThisNextLast Jahr[es]*",NORM_VA
// EXAMPLE r12b_1: Sommer 2001
// EXAMPLE r12c_1: Sommer 69
RULENAME="date_r12a",EXTRACTION="%reSeason",NORM_VALUE="UNDEF-year-%normSeason(group(1))"
-RULENAME="date_r12b",EXTRACTION="%reSeason( des Jahres | )%reYear4Digit",NORM_VALUE="group(3)-%normSeason(group(1))"
-RULENAME="date_r12c",EXTRACTION="%reSeason( des Jahres | )%reYear2Digit",NORM_VALUE="UNDEF-centurygroup(3)-%normSeason(group(1))"
+RULENAME="date_r12b",EXTRACTION="%reSeason(?: des Jahres|) %reYear4Digit",NORM_VALUE="group(2)-%normSeason(group(1))"
+RULENAME="date_r12c",EXTRACTION="%reSeason(?: des Jahres|) %reYear2Digit",NORM_VALUE="UNDEF-centurygroup(2)-%normSeason(group(1))"
//////////////////////////////
// PART-OF-YEAR GRANULARITY //
@@ -168,9 +168,9 @@ RULENAME="date_r12c",EXTRACTION="%reSeason( des Jahres | )%reYear2Digit",NORM_VA
// date_r13
// EXAMPLE r13a_1: das erste Quartal 2001
// EXAMPLE r13a_1: das erste Quartal
-RULENAME="date_r13a",EXTRACTION="([Dd]as |[Dd]er |[Dd]ie )?%rePartOfYear (des Jahres )?%reYear4Digit",NORM_VALUE="group(4)-%normPartOfYear(group(2))"
-RULENAME="date_r13b",EXTRACTION="([Dd]as |[Dd]er |[Dd]ie )?%rePartOfYear",NORM_VALUE="UNDEF-year-%normPartOfYear(group(2))"
-RULENAME="date_r13c",EXTRACTION="([Dd]as |[Dd]er |[Dd]ie )?%rePartOfYear des Vorjahres",NORM_VALUE="UNDEF-REF-year-MINUS-1%normPartOfYear(group(2))"
+RULENAME="date_r13a",EXTRACTION="(?:[Dd]as |[Dd]er |[Dd]ie |)%rePartOfYear (des Jahres )?%reYear4Digit",NORM_VALUE="group(3)-%normPartOfYear(group(1))"
+RULENAME="date_r13b",EXTRACTION="(?:[Dd]as |[Dd]er |[Dd]ie |)%rePartOfYear",NORM_VALUE="UNDEF-year-%normPartOfYear(group(1))"
+RULENAME="date_r13c",EXTRACTION="(?:[Dd]as |[Dd]er |[Dd]ie |)%rePartOfYear des Vorjahres",NORM_VALUE="UNDEF-REF-year-MINUS-1%normPartOfYear(group(1))"
//////////////////////
// Year Granularity //
@@ -178,7 +178,7 @@ RULENAME="date_r13c",EXTRACTION="([Dd]as |[Dd]er |[Dd]ie )?%rePartOfYear des Vor
// date_r14
// EXAMPLE r14a_1: 2009
// EXAMPLE r14a_2: Jahr 2009
-RULENAME="date_r14a",EXTRACTION="(Jahr[es]* )?%reYear4Digit",NORM_VALUE="group(2)"
+RULENAME="date_r14a",EXTRACTION="(?:Jahr[es]* |)%reYear4Digit",NORM_VALUE="group(1)"
//date_r15
// EXAMPLE r15a_1: 1850-58 (find: 1858)
@@ -187,7 +187,7 @@ RULENAME="date_r15a",EXTRACTION="%reYear4Digit%reAndOrTo%reYear2Digit",NORM_VALU
// date_r16
// EXAMPLE r16a_1: neunzehnsechsundneuzig
-RULENAME="date_r16a",EXTRACTION="%reNumWordTeen( |-|)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(1))%normDurationNumber(group(3))"
+RULENAME="date_r16a",EXTRACTION="%reNumWordTeen(?: |-|)%reNumWord2D",NORM_VALUE="%normDurationNumber(group(1))%normDurationNumber(group(2))"
/////////////////////////
// Century Granularity //
@@ -196,8 +196,9 @@ RULENAME="date_r16a",EXTRACTION="%reNumWordTeen( |-|)%reNumWord2D",NORM_VALUE="%
// EXAMPLE r17a_1: Das 20. Jahrhundert
// EXAMPLE r17b_1: Im 18. und 19. Jahrhundert (find: 17. Jahrhundert)
// EXAMPLE 2: the seventh century
-RULENAME="date_r17a",EXTRACTION="(Das )?(%reDayNumber[\.]?|%reDayWordTh) Jahrhundert[s]?",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))"
-RULENAME="date_r17b",EXTRACTION="(Das )?(%reDayNumber[\.]?|%reDayWordTh)%reAndOrTo(%reDayNumber[\.]?|%reDayWordTh) Jahrhundert[s]?",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))",OFFSET="group(2)-group(2)"
+RULENAME="date_r17a",EXTRACTION="(?:Das )?%(reDayNumber|reDayNumberTh|reDayWordTh) Jahrhunderts?",NORM_VALUE="%normDay(%SUM%(%normDay(group(1)),-1))"
+RULENAME="date_r17b",EXTRACTION="(?:Das )?%(reDayNumber|reDayNumberTh|reDayWordTh)%reAndOrTo%(reDayNumber|reDayNumberTh|reDayWordTh) Jahrhunderts?",NORM_VALUE="%normDay(%SUM%(%normDay(group(1)),-1))",OFFSET="group(1)-group(1)"
+RULENAME="date_r17c",EXTRACTION="%rePartWords des %(reDayNumber|reDayNumberTh|reDayWordTh) Jahrhunderts?",NORM_VALUE="%normDay(%SUM%(%normDay(group(2)),-1))",NORM_MOD="%normPartWords(group(1))"
///////////////////////////////////
// GRANULARITY INDEPENDENT RULES //
@@ -207,22 +208,22 @@ RULENAME="date_r17b",EXTRACTION="(Das )?(%reDayNumber[\.]?|%reDayWordTh)%reAndOr
// EXAMPLE r18b_1: Anfang 1999
// EXAMPLE r18c_1: Anfang November 1999
// EXAMPLE r18d_1: Anfang November 2000
-RULENAME="date_r18a",EXTRACTION="(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(1))"
-RULENAME="date_r18b",EXTRACTION="%rePartWords( |)%reYear4Digit",NORM_VALUE="group(3)",NORM_MOD="%normPartWords(group(1))"
-RULENAME="date_r18c",EXTRACTION="%rePartWords( |)(%reMonthLong|%reMonthShort) %reYear4Digit",NORM_VALUE="group(6)-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))"
-RULENAME="date_r18d",EXTRACTION="%rePartWords( |)(%reMonthLong|%reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r18a",EXTRACTION="(?:%reApproximate |)%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(2))"
+RULENAME="date_r18b",EXTRACTION="(?:%reApproximate |)%rePartWords ?%reYear4Digit",NORM_VALUE="group(3)",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r18c",EXTRACTION="(?:%reApproximate |)%rePartWords ?%(reMonthLong|reMonthShort) %reYear4Digit",NORM_VALUE="group(4)-%normMonth(group(3))",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r18d",EXTRACTION="(?:%reApproximate |)%rePartWords ?%(reMonthLong|reMonthShort)",NORM_VALUE="UNDEF-year-%normMonth(group(3))",NORM_MOD="%normPartWords(group(2))"
// date_r19
// EXAMPLE r19a_1: die 1920er Jahre
// EXAMPLE r19b_1: die 20er Jahre
// EXAMPLE r19a_1: die frühen 1920er Jahre
// EXAMPLE r19b_1: die frühen 20er Jahre
-RULENAME="date_r19a",EXTRACTION="([Dd]ie |[Dd]en )?%reYear4Digit(ern|er)( Jahren?|-Jahren?)?",NORM_VALUE="%SUBSTRING%(group(2),0,3)"
-RULENAME="date_r19b",EXTRACTION="([Dd]ie |[Dd]en )?%reYear2Digit(ern|er)( Jahren?|-Jahren?)?",NORM_VALUE="UNDEF-century%SUBSTRING%(group(2),0,1)"
-RULENAME="date_r19c",EXTRACTION="([Dd]ie |[Dd]en )?%rePartWords([ ]?)%reYear4Digit(ern|er)( Jahren?|-Jahren?)?",NORM_VALUE="%SUBSTRING%(group(4),0,3)",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r19d",EXTRACTION="([Dd]ie |[Dd]en )?%rePartWords([ ]?)%reYear2Digit(ern|er)( Jahren?|-Jahren?)?",NORM_VALUE="UNDEF-century%SUBSTRING%(group(4),0,1)",NORM_MOD="%normPartWords(group(2))"
-RULENAME="date_r19e",EXTRACTION="([Dd]ie |[Dd]en )?%reDecadeWord( [Jj]ahren?|jahren?)?",NORM_VALUE="UNDEF-century%normDecadeWord(group(2))"
-RULENAME="date_r19f",EXTRACTION="([Dd]ie |[Dd]en )?%rePartWords der %reDecadeWord( [Jj]ahren?|jahren?)?",NORM_VALUE="UNDEF-century%normDecadeWord(group(3))",NORM_MOD="%normPartWords(group(2))"
+RULENAME="date_r19a",EXTRACTION="(?:[Dd]ie |[Dd]en )?%reYear4Digit(?:ern|er)(?: Jahren?|-Jahren?|)",NORM_VALUE="%SUBSTRING%(group(1),0,3)"
+RULENAME="date_r19b",EXTRACTION="(?:[Dd]ie |[Dd]en )?%reYear2Digit(?:ern|er)(?: Jahren?|-Jahren?|)",NORM_VALUE="UNDEF-century%SUBSTRING%(group(1),0,1)"
+RULENAME="date_r19c",EXTRACTION="(?:[Dd]ie |[Dd]en )?%rePartWords ?%reYear4Digit(?:ern|er)(?: Jahren?|-Jahren?|)",NORM_VALUE="%SUBSTRING%(group(2),0,3)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r19d",EXTRACTION="(?:[Dd]ie |[Dd]en )?%rePartWords ?%reYear2Digit(?:ern|er)(?: Jahren?|-Jahren?|)",NORM_VALUE="UNDEF-century%SUBSTRING%(group(2),0,1)",NORM_MOD="%normPartWords(group(1))"
+RULENAME="date_r19e",EXTRACTION="(?:[Dd]ie |[Dd]en )?%reDecadeWord(?: [Jj]ahren?|jahren?|)",NORM_VALUE="UNDEF-century%normDecadeWord(group(1))"
+RULENAME="date_r19f",EXTRACTION="(?:[Dd]ie |[Dd]en )?%rePartWords der %reDecadeWord(?: [Jj]ahren?|-[Jj]ahren?|)",NORM_VALUE="UNDEF-century%normDecadeWord(group(2))",NORM_MOD="%normPartWords(group(1))"
// date_r20
// EXAMPLE r20a_1: dieses Jahr
@@ -230,8 +231,8 @@ RULENAME="date_r19f",EXTRACTION="([Dd]ie |[Dd]en )?%rePartWords der %reDecadeWor
// EXAMPLE r20c_1: diesen November
// EXAMPLE r20d_1: diesen Montag
// EXAMPLE r20e_1: diesen Sommer
-RULENAME="date_r20a",EXTRACTION="([Dd]er |[Dd]ie |[Dd]as )?%reThisNextLast %reUnit",NORM_VALUE="UNDEF-%normThisNextLast(group(2))-%normUnit(group(3))"
-RULENAME="date_r20b",EXTRACTION="([Ss]elbe[nrs]?|[Gg]leiche[nrs]?) Tag",NORM_VALUE="UNDEF-REF-day-PLUS-0"
+RULENAME="date_r20a",EXTRACTION="(?:[Dd]er |[Dd]ie |[Dd]as |[Dd]es |[Ii]m |[Aa]m )?%reThisNextLast %reUnit(?:e?s?)",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normUnit(group(2))"
+RULENAME="date_r20b",EXTRACTION="(?:[Aa]m |)(?:[Ss]elbe[nrs]?|[Gg]leiche[nrs]?) Tag",NORM_VALUE="UNDEF-REF-day-PLUS-0"
RULENAME="date_r20c",EXTRACTION="%reThisNextLast %reMonthLong",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normMonthToEnglish(group(2))"
RULENAME="date_r20d",EXTRACTION="%reThisNextLast %reWeekday",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normWeekday(group(2))"
RULENAME="date_r20e",EXTRACTION="%reThisNextLast %reSeason",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-%normSeason(group(2))"
@@ -248,19 +249,19 @@ RULENAME="date_r21d",EXTRACTION="%rePartWords %reThisNextLast %reSeason[nes]*",N
// date_r22
// EXAMPLE r22a_1: letztes Wochenende
-RULENAME="date_r22a",EXTRACTION="([Dd]ieses |[Ll]etztes )Wochenende",NORM_VALUE="UNDEF-last-week-WE"
+RULENAME="date_r22a",EXTRACTION="(?:[Dd]ieses |[Ll]etztes )Wochenende",NORM_VALUE="UNDEF-last-week-WE"
// date_r23
// EXAMPLE r23a_1: das letztjährige Quartal
// EXAMPLE r23b_1: das Quartal
-RULENAME="date_r23a",EXTRACTION="([Dd]er|[Dd]ie|[Dd]as) (letztjährige) Quartals?",NORM_VALUE="UNDEF-REF-quarter-MINUS-4"
-RULENAME="date_r23b",EXTRACTION="([Dd]er|[Dd]ie|[Dd]as) Quartals?",NORM_VALUE="UNDEF-REF-quarter-PLUS-0"
+RULENAME="date_r23a",EXTRACTION="(?:[Dd]er|[Dd]ie|[Dd]as) letztjährige Quartals?",NORM_VALUE="UNDEF-REF-quarter-MINUS-4"
+RULENAME="date_r23b",EXTRACTION="(?:[Dd]er|[Dd]ie|[Dd]as) Quartals?",NORM_VALUE="UNDEF-REF-quarter-PLUS-0"
// date_r24
// EXAMPLE r24a_1: ein Jahr früher
// EXAMPLE r24b_2: ein Jahr später
-RULENAME="date_r24a",EXTRACTION="[Ee]in Jahr (früher|vorher|davor)",NORM_VALUE="UNDEF-REF-year-MINUS-1"
-RULENAME="date_r24b",EXTRACTION="[Ee]in Jahr (später|danach)",NORM_VALUE="UNDEF-REF-year-PLUS-1"
+RULENAME="date_r24a",EXTRACTION="[Ee]in Jahr (?:früher|vorher|davor|vor)",NORM_VALUE="UNDEF-REF-year-MINUS-1"
+RULENAME="date_r24b",EXTRACTION="[Ee]in Jahr (?:später|danach|nach)",NORM_VALUE="UNDEF-REF-year-PLUS-1"
// date_r25
// EXAMPLE r25a_1: etwa zehn Tage später
@@ -269,12 +270,12 @@ RULENAME="date_r24b",EXTRACTION="[Ee]in Jahr (später|danach)",NORM_VALUE="UNDEF
// EXAMPLE r25d_1: etwa zehn Tage früher
// EXAMPLE r25e_1: etwa 20 Tage früher
// EXAMPLE r25f_1: etwa ein Tag früher
-RULENAME="date_r26a",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|Minuten|Stunden) (später|danach)",NORM_VALUE="UNDEF-REF-%normUnit(group(6))-PLUS-%normDurationNumber(group(3))"
-RULENAME="date_r26b",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|Minuten|Stunden) (später|danach)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-group(3)"
-RULENAME="date_r26c",EXTRACTION="(%reApproximate )?([Ee]inen|[Ee]ine|[Ee]in) (%reUnit) (später|danach)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-PLUS-1"
-RULENAME="date_r26d",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|Minuten|Stunden) (früher|vorher|davor)",NORM_VALUE="UNDEF-REF-%normUnit(group(6))-MINUS-%normDurationNumber(group(3))"
-RULENAME="date_r26e",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|Minuten|Stunden) (früher|vorher|davor)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-MINUS-group(3)"
-RULENAME="date_r26f",EXTRACTION="(%reApproximate )?([Ee]inen|[Ee]ine|[Ee]in) (%reUnit) (früher|vorher|davor)",NORM_VALUE="UNDEF-REF-%normUnit(group(4))-MINUS-1"
+RULENAME="date_r26a",EXTRACTION="(?:%reApproximate |)%(reNumWord1D|reNumWord2D) %reUnitFine (?:später|danach|nach)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-PLUS-%normDurationNumber(group(2))"
+RULENAME="date_r26b",EXTRACTION="(?:%reApproximate |)(\d+) %reUnitFine (?:später|danach|nach)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-PLUS-group(2)"
+RULENAME="date_r26c",EXTRACTION="(?:%reApproximate |)(?:[Ee]inen|[Ee]ine|[Ee]in) %reUnitFine (?:später|danach|nach)",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-PLUS-1"
+RULENAME="date_r26d",EXTRACTION="(?:%reApproximate |)%(reNumWord1D|reNumWord2D) %reUnitFine (?:früher|vorher|davor|vor)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-MINUS-%normDurationNumber(group(2))"
+RULENAME="date_r26e",EXTRACTION="(?:%reApproximate |)(\d+) %reUnitFine (?:früher|vorher|davor|vor)",NORM_VALUE="UNDEF-REF-%normUnit(group(3))-MINUS-group(2)"
+RULENAME="date_r26f",EXTRACTION="(?:%reApproximate |)(?:[Ee]inen|[Ee]ine|[Ee]in) %reUnitFine (?:früher|vorher|davor|vor)",NORM_VALUE="UNDEF-REF-%normUnit(group(2))-MINUS-1"
///////////////////
// HOLIDAY RULES //
@@ -298,27 +299,30 @@ RULENAME="date_r28c",EXTRACTION="%reHolidayVar %reYear2Digit",NORM_VALUE="UNDEF-
// PAST, PRESENT, FUTURE expressions
-RULENAME="date_r29a",EXTRACTION="([Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnit (später)",NORM_VALUE="FUTURE_REF"
-RULENAME="date_r29b",EXTRACTION="([Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnit (früher)",NORM_VALUE="PAST_REF"
+RULENAME="date_r29a",EXTRACTION="(?:[Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnit später",NORM_VALUE="FUTURE_REF"
+RULENAME="date_r29b",EXTRACTION="(?:[Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnit früher",NORM_VALUE="PAST_REF"
////////////////////
// NEGATIVE RULES //
////////////////////
// CHECK THESE RULES (the German tag set is not equal to the English tag set)
-RULENAME="date_r1a_negative",EXTRACTION="(2[3456789]\d\d)",NORM_VALUE="REMOVE"
-RULENAME="date_r1b1_negative",EXTRACTION="%reYear4Digit ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NN:"
-RULENAME="date_r1b2_negative",EXTRACTION="%reYear4Digit (respektive|von|oder) (%reYear4Digit|[\d]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(5):NN:"
-RULENAME="date_r1c1_negative",EXTRACTION="%reYear4Digit ([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):ADJA:group(3):NN:"
-RULENAME="date_r1c2_negative",EXTRACTION="%reYear4Digit ([\S]+) ([\S]+) ([\S]+) ([\S]+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):ADJA:group(3):KON:group(4):ADJA:group(5):NN:"
-RULENAME="date_r1d_negative",EXTRACTION="%reYear4Digit (m\b|km\b|ft\b|yr\b|dg\b|cm\b|ha\b|sq\b|PS\b)",NORM_VALUE="REMOVE"
+RULENAME="date_r1a_negative",EXTRACTION="2[3456789]\d\d",NORM_VALUE="REMOVE"
+// March 2017 reduced range: many missing matches e.g. in Heidelberg University Wiki article.
+RULENAME="date_r1b1_negative",EXTRACTION="(?:1[0-7]\d\d|0\d\d\d|2[1-9]\d\d) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):NN:"
+RULENAME="date_r1c1_negative",EXTRACTION="(?:1[0-7]\d\d|0\d\d\d|2[1-9]\d\d) (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):ADJA:group(2):NN:"
+RULENAME="date_r1c2_negative",EXTRACTION="(?:1[0-7]\d\d|0\d\d\d|2[1-9]\d\d) (\S+) (\S+) (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(1):ADJA:group(2):KON:group(3):ADJA:group(4):NN:"
+//RULENAME="date_r1b1_negative",EXTRACTION="%reYear4Digit (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NN:"
+RULENAME="date_r1b2_negative",EXTRACTION="%reYear4Digit (?:respektive|von|oder) \d+ (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):NN:"
+//RULENAME="date_r1c1_negative",EXTRACTION="%reYear4Digit (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):ADJA:group(3):NN:"
+//RULENAME="date_r1c2_negative",EXTRACTION="%reYear4Digit (\S+) (\S+) (\S+) (\S+)",NORM_VALUE="REMOVE",POS_CONSTRAINT="group(2):ADJA:group(3):KON:group(4):ADJA:group(5):NN:"
+RULENAME="date_r1d_negative",EXTRACTION="%reYear4Digit (?:m\b|km\b|ft\b|yr\b|dg\b|cm\b|ha\b|sq\b|PS\b|Euro)",NORM_VALUE="REMOVE"
// Further negative rules (March 2015, Jannik Strötgen)
// typical fairy tail expression
// EXAMPLE r1e1_negative: Märchen aus 1001 Nacht
// EXAMPLE r1e2_negative: 1001-Nacht
-RULENAME="date_r1e1_negative",EXTRACTION="Märchen aus 1001 Nacht",NORM_VALUE="REMOVE"
-RULENAME="date_r1e2_negative",EXTRACTION="1001(-| )Nacht",NORM_VALUE="REMOVE"
+RULENAME="date_r1e1_negative",EXTRACTION="(?:Märchen aus )?1001[ -]Nacht",NORM_VALUE="REMOVE"
// Further negative rules (March 2015, Jannik Strötgen)
// Addresses and room numbers
@@ -326,20 +330,20 @@ RULENAME="date_r1e2_negative",EXTRACTION="1001(-| )Nacht",NORM_VALUE="REMOVE"
// EXAMPLE r2b_negative: 1010 Wien
// EXAMPLE r2c_negative: 1600 Pennsylvania Avenue
// EXAMPLE r2d_negative: Sitzungssaal 1901
-RULENAME="date_r2a_negative",EXTRACTION="([A-Z][\S]+)?([Ss]traße|[Ss]trasse|[Gg]asse|[Ww]eg) (\d)+, %reYear4Digit ([A-Z][\S]+\b)",NORM_VALUE="REMOVE"
-RULENAME="date_r2b_negative",EXTRACTION="%reYear4Digit (Wien|Graz|Linz|Salzburg|Innsbruck)",NORM_VALUE="REMOVE"
-RULENAME="date_r2c_negative",EXTRACTION="%reYear4Digit [A-Z]([\S]+) (Avenue|Street)",NORM_VALUE="REMOVE"
-RULENAME="date_r2d_negative",EXTRACTION="([A-Z][\S]+)?([Ss]aal|[Rr]aum|[Zz]immer) %reYear4Digit",NORM_VALUE="REMOVE"
+RULENAME="date_r2a_negative",EXTRACTION="(?:[A-Z]\S+)?(?:[Ss]traße|[Ss]trasse|[Gg]asse|[Ww]eg) \d+, %reYear4Digit [A-Z]\S+\b",NORM_VALUE="REMOVE"
+RULENAME="date_r2b_negative",EXTRACTION="%reYear4Digit (?:Wien|Graz|Linz|Salzburg|Innsbruck)",NORM_VALUE="REMOVE"
+RULENAME="date_r2c_negative",EXTRACTION="%reYear4Digit [A-Z]\S+ (?:Avenue|Street|Ave)",NORM_VALUE="REMOVE"
+RULENAME="date_r2d_negative",EXTRACTION="(?:[A-Z]\S+)?(?:[Ss]aal|[Rr]aum|[Zz]immer) %reYear4Digit",NORM_VALUE="REMOVE"
// Further negative rules (March 2015, Jannik Strötgen)
// EXAMPLE r3a_negative: 1200 davon [sind tot]
// EXAMPLE r3b_negative: mindestens 2000 [sind tot]
// EXAMPLE r3c_negative: von 2000 auf 1800 [reduziert]
-RULENAME="date_r3a_negative",EXTRACTION="%reYear4Digit (davon)",NORM_VALUE="REMOVE"
-RULENAME="date_r3b_negative",EXTRACTION="([Üü]ber|[Dd]arunter|[Dd]avon|[Kk]napp|[Ww]eniger als|[Mm]ehr als|[Rr]und|[Mm]indestens|[Hh]öchstens|[Mm]aximal|[Ww]eitere) %reYear4Digit",NORM_VALUE="REMOVE"
-RULENAME="date_r3c_negative",EXTRACTION="(von|um) (%reYear4Digit|[\d]+) (auf) (%reYear4Digit|[\d]+)",NORM_VALUE="REMOVE"
+RULENAME="date_r3a_negative",EXTRACTION="%reYear4Digit davon",NORM_VALUE="REMOVE"
+RULENAME="date_r3b_negative",EXTRACTION="(?:[Üü]ber|[Dd]arunter|[Dd]avon|[Kk]napp|[Ww]eniger als|[Mm]ehr als|[Rr]und|[Mm]indestens|[Hh]öchstens|[Mm]aximal|[Ww]eitere) %reYear4Digit",NORM_VALUE="REMOVE"
+RULENAME="date_r3c_negative",EXTRACTION="(?:von|um) \d+ auf \d+",NORM_VALUE="REMOVE"
// Further negative rules (March 2015, Jannik Strötgen)
// EXAMPLE r4a_negative: UN Resolution 1441
-RULENAME="date_r4a_negative",EXTRACTION="([Rr]esolution|[Ee]ntschließung) %reYear4Digit",NORM_VALUE="REMOVE"
+RULENAME="date_r4a_negative",EXTRACTION="(?:[Rr]esolution|[Ee]ntschließung) %reYear4Digit",NORM_VALUE="REMOVE"
diff --git a/resources/german/rules/resources_rules_durationrules.txt b/resources/german/rules/resources_rules_durationrules.txt
index 9e334e63..8513a366 100644
--- a/resources/german/rules/resources_rules_durationrules.txt
+++ b/resources/german/rules/resources_rules_durationrules.txt
@@ -10,16 +10,16 @@
// duration_r1
// EXAMPLE r1a_1: etwa fünf Tage
// EXAMPLE r1b_1: etwa 20 Tage
-// EXAMPLE r1c_1: etwa fünf Stunden
-// EXAMPLE r1d_1: etwa 20 Stunden
-RULENAME="duration_r1a1",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) %reUnit",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(6))"
-RULENAME="duration_r1b1",EXTRACTION="(%reApproximate )?([\d]+) %reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(4))"
-RULENAME="duration_r1c1",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (Minuten?|Stunden?)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(6))"
-RULENAME="duration_r1d1",EXTRACTION="(%reApproximate )?([\d]+) (Minuten?|Stunden?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(4))"
-RULENAME="duration_r1a2",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D)%reAndOrTo(%reNumWord2D|%reNumWord1D) %reUnit",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(10))",OFFSET="group(0)-group(3)"
-RULENAME="duration_r1b2",EXTRACTION="(%reApproximate )?([\d]+)%reAndOrTo([\d]+) %reUnit",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(6))",OFFSET="group(0)-group(3)"
-RULENAME="duration_r1c2",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D)%reAndOrTo(%reNumWord2D|%reNumWord1D) (Minuten?|Stunden?)",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(10))"
-RULENAME="duration_r1d2",EXTRACTION="(%reApproximate )?([\d]+)%reAndOrTo([\d]+) (Minuten?|Stunden?)",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(6))"
+// EXAMPLE r1a2_1: etwa fünf Stunden
+// EXAMPLE r1b2_1: etwa 20 Stunden
+RULENAME="duration_r1a1",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)%(reNumWord1D|reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(2))%normUnit4Duration(group(3))"
+RULENAME="duration_r1b1",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)(\d+) %reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(3))"
+RULENAME="duration_r1a2",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)%(reNumWord1D|reNumWord2D) ([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)",NORM_VALUE="PT%normDurationNumber(group(2))%normUnit4Duration(group(3))"
+RULENAME="duration_r1b2",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)(\d+) ([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)",NORM_VALUE="PTgroup(2)%normUnit4Duration(group(3))"
+RULENAME="duration_r1c1",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)%(reNumWord1D|reNumWord2D)%reAndOrTo%(reNumWord1D|reNumWord2D) %reUnit",NORM_VALUE="P%normDurationNumber(group(2))%normUnit4Duration(group(5))",OFFSET="group(0)-group(2)"
+RULENAME="duration_r1d1",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)(\d+)%reAndOrTo(\d+) %reUnit",NORM_VALUE="Pgroup(2)%normUnit4Duration(group(5))",OFFSET="group(0)-group(2)"
+RULENAME="duration_r1c2",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)%(reNumWord1D|reNumWord2D)%reAndOrTo%(reNumWord1D|reNumWord2D) ([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)",NORM_VALUE="PT%normDurationNumber(group(2))%normUnit4Duration(group(5))",OFFSET="group(0)-group(2)"
+RULENAME="duration_r1d2",EXTRACTION="(?:Nach |)(?:[Dd]en |%reApproximate |)(\d+)%reAndOrTo(\d+) ([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)",NORM_VALUE="PTgroup(2)%normUnit4Duration(group(5))",OFFSET="group(0)-group(2)"
// duration_r2
// EXAMPLE r2a_1: die nächsten zwanzig Tage
@@ -28,28 +28,30 @@ RULENAME="duration_r1d2",EXTRACTION="(%reApproximate )?([\d]+)%reAndOrTo([\d]+)
// EXAMPLE r2d_1: die nächsten zwanzig Minuten
// EXAMPLE r2e_1: die nächsten 20 Minuten
// EXAMPLE r2f_1: die nächsten paar Minuten
-RULENAME="duration_r2a",EXTRACTION="(%reApproximate )?die %reThisNextLast (%reNumWord2D|%reNumWord1D)( |-)%reUnit( oder so)?",NORM_VALUE="P%normDurationNumber(group(4))%normUnit4Duration(group(8))"
-RULENAME="duration_r2b",EXTRACTION="(%reApproximate )?die %reThisNextLast ([\d]+)( |-)%reUnit( oder so)?",NORM_VALUE="Pgroup(4)%normUnit4Duration(group(6))"
-RULENAME="duration_r2c",EXTRACTION="(%reApproximate )?die %reThisNextLast (paar|wenigen?|einigen?) %reUnit( oder so)?",NORM_VALUE="PX%normUnit4Duration(group(5))"
-RULENAME="duration_r2d",EXTRACTION="(%reApproximate )?die %reThisNextLast (%reNumWord2D|%reNumWord1D)( |-)(Minuten?|Stunden?)( oder so)?",NORM_VALUE="PT%normDurationNumber(group(4))%normUnit4Duration(group(8))"
-RULENAME="duration_r2e",EXTRACTION="(%reApproximate )?die %reThisNextLast ([\d]+)( |-)(Minuten?|Stunden?)( oder so)?",NORM_VALUE="PTgroup(4)%normUnit4Duration(group(6))"
-RULENAME="duration_r2f",EXTRACTION="(%reApproximate )?die %reThisNextLast (paar|wenigen?|einigen?) (Minuten?|Stunden?)( oder so)?",NORM_VALUE="PTX%normUnit4Duration(group(5))"
+RULENAME="duration_r2a",EXTRACTION="(?:%reApproximate |)die %reThisNextLast %(reNumWord1D|reNumWord2D)[ -]%reUnit(?: oder so)?",NORM_VALUE="P%normDurationNumber(group(3))%normUnit4Duration(group(4))"
+RULENAME="duration_r2b",EXTRACTION="(?:%reApproximate |)die %reThisNextLast (\d+)[ -]%reUnit(?: oder so)?",NORM_VALUE="Pgroup(3)%normUnit4Duration(group(4))"
+RULENAME="duration_r2c",EXTRACTION="(?:%reApproximate |)die %reThisNextLast (?:paar|wenigen?|einigen?) %reUnit(?: oder so)?",NORM_VALUE="PX%normUnit4Duration(group(3))"
+// Note: PTX, not PT; subtle difference
+RULENAME="duration_r2d",EXTRACTION="(?:%reApproximate |)die %reThisNextLast %(reNumWord1D|reNumWord2D)[ -]([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)(?: oder so)?",NORM_VALUE="PT%normDurationNumber(group(3))%normUnit4Duration(group(4))"
+RULENAME="duration_r2e",EXTRACTION="(?:%reApproximate |)die %reThisNextLast (\d+)[ -]([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)(?: oder so)?",NORM_VALUE="PTgroup(3)%normUnit4Duration(group(4))"
+RULENAME="duration_r2f",EXTRACTION="(?:%reApproximate |)die %reThisNextLast (?:paar|wenigen?|einigen?) ([Ss]tunden?|[Mm]inuten?|[Ss]ekunden?)(?: oder so)?",NORM_VALUE="PTX%normUnit4Duration(group(3))"
// duration_r3
// EXAMPLE r3a_1: ein Jahr
-// EXAMPLE r3b_1: eine Stunde
+// EXAMPLE r3a_1: eine Stunde
// EXAMPLE r3c_1: 20-tägig
// EXAMPLE r3d_1: 20-stündig
-RULENAME="duration_r3a",EXTRACTION="ein(|e|es|em|er|en)( |-)%reUnit",NORM_VALUE="P1%normUnit4Duration(group(3))"
-RULENAME="duration_r3b",EXTRACTION="einer?( |-)(Minuten?|Stunden?)",NORM_VALUE="PT1%normUnit4Duration(group(2))"
-RULENAME="duration_r3c",EXTRACTION="([\d]+)( |-)tägige?[ns]?",NORM_VALUE="PTgroup(1)D"
-RULENAME="duration_r3d",EXTRACTION="([\d]+)( |-)stündige?[ns]?",NORM_VALUE="PTgroup(1)D"
+RULENAME="duration_r3a",EXTRACTION="ein(?:e|es|em|er|en|)[ -]%reUnit",NORM_VALUE="P1%normUnit4Duration(group(1))"
+RULENAME="duration_r3c",EXTRACTION="(\d+)[ -]tägige?[ns]?",NORM_VALUE="Pgroup(1)D"
+RULENAME="duration_r3d",EXTRACTION="(\d+)[ -]stündige?[ns]?",NORM_VALUE="PTgroup(1)H"
+RULENAME="duration_r3e",EXTRACTION="(\d+)[ -]minütige?[ns]?",NORM_VALUE="PTgroup(1)M"
+RULENAME="duration_r3f",EXTRACTION="(\d+)[ -]sekündige?[ns]?",NORM_VALUE="PTgroup(1)S"
// reUnitPlural
-RULENAME="duration_r4a",EXTRACTION="[Dd](ie|en) ([\S]+) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(3))","POS_CONSTRAINT="group(2):ADJA:"
-RULENAME="duration_r4b",EXTRACTION="[Dd](ie|en) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(2))"
-RULENAME="duration_r4c",EXTRACTION="([Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(2))"
-RULENAME="duration_r4d",EXTRACTION="([Ss]eit) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(2))",OFFSET="group(2)-group(2)"
+RULENAME="duration_r4a",EXTRACTION="[Dd](?:ie|en) (\S+) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(2))",POS_CONSTRAINT="group(1):ADJA:"
+RULENAME="duration_r4b",EXTRACTION="[Dd](?:ie|en) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(1))"
+RULENAME="duration_r4c",EXTRACTION="(?:[Ee]inigen?|[Ww]enigen?|[Vv]ielen?) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(1))"
+RULENAME="duration_r4d",EXTRACTION="(?:[Ss]eit) %reUnitPlural",NORM_VALUE="PX%normUnit4Duration(group(1))",OFFSET="group(1)-group(1)"
////////////////////
// NEGATIVE RULES //
@@ -58,7 +60,7 @@ RULENAME="duration_r4d",EXTRACTION="([Ss]eit) %reUnitPlural",NORM_VALUE="PX%norm
// EXAMPLE r1a_negation_1: zwanzig Jahre alt
// EXAMPLE r1b_negation_1: 20 Jahre alt
// EXAMPLE r1c_negation_1: einige Jahre alt
-RULENAME="duration_r1a_negation",EXTRACTION="(%reApproximate )?(%reNumWord2D|%reNumWord1D) (%reUnit|Minuten?|Stunden?) (älter|jünger|alt|jung)",NORM_VALUE="REMOVE"
-RULENAME="duration_r1b_negation",EXTRACTION="(%reApproximate )?([\d]+) (%reUnit|Minuten?|Stunden?) (älter|jünger|alt|jung)",NORM_VALUE="REMOVE"
-RULENAME="duration_r1c_negation",EXTRACTION="(%reApproximate )?(einige) %reUnit (älter|jünger|alt|jung)",NORM_VALUE="REMOVE"
+RULENAME="duration_r1a_negation",EXTRACTION="(?:%reApproximate |)%(reNumWord1D|reNumWord2D) %reUnitFine (?:älter|jünger|alt|jung)",NORM_VALUE="REMOVE"
+RULENAME="duration_r1b_negation",EXTRACTION="(?:%reApproximate |)\d+ %reUnitFine (?:älter|jünger|alt|jung)",NORM_VALUE="REMOVE"
+RULENAME="duration_r1c_negation",EXTRACTION="(?:%reApproximate |)einige %reUnit (?:älter|jünger|alt|jung)",NORM_VALUE="REMOVE"
diff --git a/resources/german/rules/resources_rules_intervalrules.txt b/resources/german/rules/resources_rules_intervalrules.txt
index 59e67a15..d0fb6a8c 100644
--- a/resources/german/rules/resources_rules_intervalrules.txt
+++ b/resources/german/rules/resources_rules_intervalrules.txt
@@ -16,4 +16,4 @@ RULENAME="interval_02",EXTRACTION="(?:[zZ]wischen|[wW]ährend) un
RULENAME="interval_03",EXTRACTION="(?:[vV]on)?(?: )?-(?: )?",NORM_VALUE="group(1)-group(2)"
RULENAME="interval_04",EXTRACTION=" bis ",NORM_VALUE="group(1)-group(2)"
RULENAME="interval_05",EXTRACTION="begann (?:in|im|am) (?:,)? und endete (?:in|im|am) ",NORM_VALUE="group(1)-group(2)"
-RULENAME="interval_06",EXTRACTION="/,",NORM_VALUE="group(1)-group(2)"
\ No newline at end of file
+RULENAME="interval_06",EXTRACTION="/,",NORM_VALUE="group(1)-group(2)"
diff --git a/resources/german/rules/resources_rules_setrules.txt b/resources/german/rules/resources_rules_setrules.txt
index 43b09df6..72e4a9dd 100644
--- a/resources/german/rules/resources_rules_setrules.txt
+++ b/resources/german/rules/resources_rules_setrules.txt
@@ -12,10 +12,10 @@
// EXAMPLE 2: every Monday
// EXAMPLE 3: each September
// EXAMPLE 4: every summer
-RULENAME="set_r1a",EXTRACTION="(jede[nrs]?) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(2)),0,1))",NORM_QUANT="EVERY"
-RULENAME="set_r1b",EXTRACTION="(jede[nrs]?) %reWeekday",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(2))",NORM_QUANT="EVERY",NORM_FREQ="1W"
-RULENAME="set_r1c",EXTRACTION="(jede[nrs]?) (%reMonthLong|%reMonthShort)",NORM_VALUE="XXXX-%normMonth(group(2))",NORM_QUANT="EVERY",NORM_FREQ="1M"
-RULENAME="set_r1d",EXTRACTION="(jede[nrs]?) %reSeason",NORM_VALUE="XXXX-%normSeason(group(2))",NORM_QUANT="EVERY",NORM_FREQ="1S"
+RULENAME="set_r1a",EXTRACTION="(?:jede[nrs]?) %reUnit",NORM_VALUE="P1%UPPERCASE%(%SUBSTRING%(%normUnit(group(1)),0,1))",NORM_QUANT="EVERY"
+RULENAME="set_r1b",EXTRACTION="(?:jede[nrs]?) %reWeekday",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))",NORM_QUANT="EVERY",NORM_FREQ="1W"
+RULENAME="set_r1c",EXTRACTION="(?:jede[nrs]?) %(reMonthLong|reMonthShort)",NORM_VALUE="XXXX-%normMonth(group(1))",NORM_QUANT="EVERY",NORM_FREQ="1M"
+RULENAME="set_r1d",EXTRACTION="(?:jede[nrs]?) %reSeason",NORM_VALUE="XXXX-%normSeason(group(1))",NORM_QUANT="EVERY",NORM_FREQ="1S"
// set_r2
// EXAMPLE r2a-1: jährlich
@@ -27,6 +27,6 @@ RULENAME="set_r2a",EXTRACTION="%reSetWords",NORM_VALUE="%normSetWords(group(1))"
// set_r3
// EXAMPLE r3a_1: Montag vormittags
// EXAMPLE r3a_1: Montag und Samstag nachts (find: Montag nachts)
-RULENAME="set_r3a",EXTRACTION="%reWeekday[ ]?%rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(2))",NORM_FREQ="1W"
-RULENAME="set_r3b",EXTRACTION="%reWeekday (und|oder) %reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(4))",NORM_FREQ="1W",OFFSET="group(1)-group(1)"
+RULENAME="set_r3a",EXTRACTION="%reWeekday ?%rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(2))",NORM_FREQ="1W"
+RULENAME="set_r3b",EXTRACTION="%reWeekday (?:und|oder) %reWeekday %rePartOfDay[s]",NORM_VALUE="XXXX-WXX-%normDayInWeek(group(1))T%normPartOfDay(group(3))",NORM_FREQ="1W",OFFSET="group(1)-group(1)"
diff --git a/resources/german/rules/resources_rules_timerules.txt b/resources/german/rules/resources_rules_timerules.txt
index 9ab47740..a5253096 100644
--- a/resources/german/rules/resources_rules_timerules.txt
+++ b/resources/german/rules/resources_rules_timerules.txt
@@ -15,11 +15,11 @@
// EXAMPLE r1c-1: 12/29/2000 20:29
// EXAMPLE r1d-1: 12/29/2000 20:29:29
// EXAMPLE r1e-1: 12/29/2000 20:29:29.79
-RULENAME="time_r1a",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7):group(8)"
-RULENAME="time_r1b",EXTRACTION="(%reYear4Digit-%reMonthNumber-%reDayNumber)(T| )%reTimeHour:%reTimeMinute",NORM_VALUE="group(2)-group(3)-group(4)Tgroup(6):group(7)"
-RULENAME="time_r1c",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6)"
-RULENAME="time_r1d",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6):group(7)"
-RULENAME="time_r1e",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reTimeHour:%reTimeMinute:%reTimeMinute\.%reYear2Digit",NORM_VALUE="group(4)-group(2)-group(3)Tgroup(5):group(6):group(7).group(8)"
+RULENAME="time_r1a",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber[T ]%reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5):group(6)"
+RULENAME="time_r1b",EXTRACTION="%reYear4Digit-%reMonthNumber-%reDayNumber[T ]%reTimeHour:%reTimeMinute",NORM_VALUE="group(1)-group(2)-group(3)Tgroup(4):group(5)"
+RULENAME="time_r1c",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5)"
+RULENAME="time_r1d",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5):group(6)"
+RULENAME="time_r1e",EXTRACTION="%reMonthNumber/%reDayNumber/%reYear4Digit %reTimeHour:%reTimeMinute:%reTimeMinute\.(\d\d)",NORM_VALUE="group(3)-group(1)-group(2)Tgroup(4):group(5):group(6).group(7)"
/////////////////////////////
// PART-OF-DAY GRANULARITY //
@@ -29,11 +29,10 @@ RULENAME="time_r1e",EXTRACTION="(%reMonthNumber/%reDayNumber/%reYear4Digit) %reT
// EXAMPLE r2b_1: Monday night
// EXAMPLE r2c_1: midnight today
// EXAMPLE r2d_1: yesterday morning
-RULENAME="time_r2a",EXTRACTION="%rePartOfDay %reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(2))T%normPartOfDay(group(1))"
-RULENAME="time_r2b1",EXTRACTION="%reWeekday %rePartOfDay",NORM_VALUE="UNDEF-day-%normWeekday(group(1))T%normPartOfDay(group(2))"
-RULENAME="time_r2b2",EXTRACTION="%reWeekday%rePartOfDay",NORM_VALUE="UNDEF-day-%normWeekday(group(1))T%normPartOfDay(group(2))"
-RULENAME="time_r2c",EXTRACTION="%rePartOfDay %reDateWord",NORM_VALUE="%normDateWord(group(2))T%normPartOfDay(group(1))"
-RULENAME="time_r2d",EXTRACTION="%reDateWord %rePartOfDay",NORM_VALUE="%normDateWord(group(1))T%normPartOfDay(group(2))"
+RULENAME="time_r2a",EXTRACTION="%rePartOfDay (?:des )?%reWeekday",NORM_VALUE="UNDEF-day-%normWeekday(group(2))T%normPartOfDay(group(1))"
+RULENAME="time_r2b",EXTRACTION="%reWeekday(?:des | |)%rePartOfDay",NORM_VALUE="UNDEF-day-%normWeekday(group(1))T%normPartOfDay(group(2))"
+RULENAME="time_r2c",EXTRACTION="%rePartOfDay (?:des )?%reDateWord",NORM_VALUE="%normDateWord(group(2))T%normPartOfDay(group(1))"
+RULENAME="time_r2d",EXTRACTION="%reDateWord (?:des )?%rePartOfDay",NORM_VALUE="%normDateWord(group(1))T%normPartOfDay(group(2))"
///////////////////////////
// TIMEPOINT GRANULARITY //
@@ -42,28 +41,28 @@ RULENAME="time_r2d",EXTRACTION="%reDateWord %rePartOfDay",NORM_VALUE="%normDateW
// EXAMPLE r3a_1: 14:30 Uhr
// EXAMPLE r3b_1: 14 Uhr 30
// EXAMPLE r3c_1: 15 Uhr
-RULENAME="time_r3a",EXTRACTION="(%reApproximate )?%reTimeHour:%reTimeMinute( Uhr)?",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):group(4)"
-RULENAME="time_r3b",EXTRACTION="(%reApproximate )?%reTimeHour (Uhr) %reTimeMinute",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):group(5)"
-RULENAME="time_r3c",EXTRACTION="(%reApproximate )?%reTimeHour Uhr",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):00"
+RULENAME="time_r3a",EXTRACTION="(?:%reApproximate |)%reTimeHour:%reTimeMinute(?: Uhr)?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3)"
+RULENAME="time_r3b",EXTRACTION="(?:%reApproximate |)%reTimeHour (?:Uhr) %reTimeMinute",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):group(3)"
+RULENAME="time_r3c",EXTRACTION="(?:%reApproximate |)%reTimeHour Uhr",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):00"
-RULENAME="time_r3d1",EXTRACTION="(%reApproximate )?%reTimeHourWord Uhr( morgens)?",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):00"
-RULENAME="time_r3d2",EXTRACTION="(%reApproximate )?%reTimeHourWord Uhr (abends|nachmittags)",NORM_VALUE="UNDEF-this-dayT%SUM%(%normDay(group(3)),12):00"
-RULENAME="time_r3d3",EXTRACTION="(%reApproximate )?%reTimeHourWordAll Uhr",NORM_VALUE="UNDEF-this-dayT%normDay(group(3)):00"
+RULENAME="time_r3d1",EXTRACTION="(?:%reApproximate |)%reTimeHourWord Uhr(?: morgens)?",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):00"
+RULENAME="time_r3d2",EXTRACTION="(?:%reApproximate |)%reTimeHourWord Uhr (?:abends|nachmittags)",NORM_VALUE="UNDEF-REF-day-PLUS-0T%SUM%(%normDay(group(2)),12):00"
+RULENAME="time_r3d3",EXTRACTION="(?:%reApproximate |)%reTimeHourWordAll Uhr",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normDay(group(2)):00"
// time_r4
// EXAMPLE r4a_1: Morgen des 1. August 2000
// EXAMPLE r4b_1: Morgen des 1. August
-RULENAME="time_r4a",EXTRACTION="(%reApproximate )?%rePartOfDay (des) (%reDayNumber)([\.]? |[\.])(%reMonthLong|%reMonthShort|%reMonthNumber[\.]?),? %reYear4Digit",NORM_VALUE="group(12)-%normMonth(group(8))-%normDay(group(5))T%normPartOfDay(group(3))"
-RULENAME="time_r4b",EXTRACTION="(%reApproximate )?%rePartOfDay (des) (%reDayNumber)([\.]? |[\.])(%reMonthLong|%reMonthShort|%reMonthNumber[\.]?)",NORM_VALUE="UNDEF-year-%normMonth(group(8))-%normDay(group(5))T%normPartOfDay(group(3))"
+RULENAME="time_r4a",EXTRACTION="(?:%reApproximate |)%rePartOfDay des %(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort|reMonthNumber)\.?,? %reYear4Digit",NORM_VALUE="group(5)-%normMonth(group(4))-%normDay(group(3))T%normPartOfDay(group(2))"
+RULENAME="time_r4b",EXTRACTION="(?:%reApproximate |)%rePartOfDay des %(reDayNumber|reDayNumberTh|reDayWordTh) %(reMonthLong|reMonthShort|reMonthNumber)",NORM_VALUE="UNDEF-year-%normMonth(group(4))-%normDay(group(3))T%normPartOfDay(group(2))"
// time_r5
// EXAMPLE r5a-1: (am) Morgen
// EXAMPLE r5b-1: nächsten Morgen
// EXAMPLE r5c-1: (am) Morgen desselben Tages
-RULENAME="time_r5a",EXTRACTION="(\b[Aa]m) %rePartOfDay",NORM_VALUE="UNDEF-this-dayT%normPartOfDay(group(2))",OFFSET="group(2)-group(2)"
-RULENAME="time_r5b",EXTRACTION="%reThisNextLast %rePartOfDay",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-dayT%normPartOfDay(group(2))"
-RULENAME="time_r5c",EXTRACTION="(de[rm]selben?) %rePartOfDay",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normPartOfDay(group(2))"
-RULENAME="time_r5d",EXTRACTION="%rePartOfDay (desselben|dieses) (Tages)",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normPartOfDay(group(1))"
+RULENAME="time_r5a",EXTRACTION="(?:[Aa]m) %rePartOfDay",NORM_VALUE="UNDEF-this-dayT%normPartOfDay(group(1))",OFFSET="group(1)-group(1)"
+RULENAME="time_r5b",EXTRACTION="(?:[Aa]m) %reThisNextLast %rePartOfDay",NORM_VALUE="UNDEF-%normThisNextLast(group(1))-dayT%normPartOfDay(group(2))"
+RULENAME="time_r5c",EXTRACTION="(?:de[rm]selben?) %rePartOfDay",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normPartOfDay(group(1))"
+RULENAME="time_r5d",EXTRACTION="(?:[Aa]m) %rePartOfDay (?:desselben|dieses) Tages",NORM_VALUE="UNDEF-REF-day-PLUS-0T%normPartOfDay(group(1))"
diff --git a/src/de/unihd/dbs/heideltime/standalone/CLISwitch.java b/src/de/unihd/dbs/heideltime/standalone/CLISwitch.java
index 145482be..98baa1de 100644
--- a/src/de/unihd/dbs/heideltime/standalone/CLISwitch.java
+++ b/src/de/unihd/dbs/heideltime/standalone/CLISwitch.java
@@ -6,6 +6,7 @@
import java.util.Date;
+import de.unihd.dbs.uima.annotator.heideltime.DocumentType;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
/**
@@ -18,7 +19,7 @@ public enum CLISwitch {
ENCODING ("Encoding to use", "-e", "UTF-8"),
OUTPUTTYPE ("Output Type output type to use", "-o", OutputType.TIMEML),
LANGUAGE ("Language to use", "-l", Language.ENGLISH.toString()),
- DOCTYPE ("Document Type/Domain to use", "-t", DocumentType.NARRATIVES),
+ DOCTYPE ("Document Type/Domain to use", "-t", DocumentType.NARRATIVE),
DCT ("Document Creation Time. Format: YYYY-mm-dd.", "-dct", new Date()),
CONFIGFILE ("Configuration file path", "-c", "config.props"),
LOCALE ("Locale", "-locale", null),
diff --git a/src/de/unihd/dbs/heideltime/standalone/DocumentType.java b/src/de/unihd/dbs/heideltime/standalone/DocumentType.java
index 4ca4baad..91ad03e5 100644
--- a/src/de/unihd/dbs/heideltime/standalone/DocumentType.java
+++ b/src/de/unihd/dbs/heideltime/standalone/DocumentType.java
@@ -1,44 +1,28 @@
-/*
- * DocumentType.java
- *
- * Copyright (c) 2011, Database Research Group, Institute of Computer Science, University of Heidelberg.
- * All rights reserved. This program and the accompanying materials
- * are made available under the terms of the GNU General Public License.
- *
- * authors: Andreas Fay, Jannik Strötgen
- * email: fay@stud.uni-heidelberg.de, stroetgen@uni-hd.de
- *
- * HeidelTime is a multilingual, cross-domain temporal tagger.
- * For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
- */
-
-package de.unihd.dbs.heideltime.standalone;
-
-/**
- * Type of document to be processed by HeidelTime
- *
- * @author Andreas Fay, University of Heidelberg
- * @version 1.0
- */
-public enum DocumentType {
- NARRATIVES {
- public String toString() {
- return "narratives";
- }
- },
- NEWS {
- public String toString() {
- return "news";
- }
- },
- COLLOQUIAL {
- public String toString() {
- return "colloquial";
- }
- },
- SCIENTIFIC {
- public String toString() {
- return "scientific";
- }
- }
-}
+package de.unihd.dbs.heideltime.standalone;
+
+/**
+ * Legacy constants to transition to the enum at {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType}.
+ *
+ * Because we cannot subclass enums, this will not be binary compatible,
+ * but at least we get compile time compatibility.
+ *
+ * @author Erich Schubert
+ */
+@Deprecated
+public final class DocumentType {
+	/** Use {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType#NARRATIVE} instead. */
+ @Deprecated
+ public static final de.unihd.dbs.uima.annotator.heideltime.DocumentType NARRATIVES = de.unihd.dbs.uima.annotator.heideltime.DocumentType.NARRATIVE;
+
+	/** Use {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType#NEWS} instead. */
+ @Deprecated
+ public static final de.unihd.dbs.uima.annotator.heideltime.DocumentType NEWS = de.unihd.dbs.uima.annotator.heideltime.DocumentType.NEWS;
+
+	/** Use {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType#COLLOQUIAL} instead. */
+ @Deprecated
+ public static final de.unihd.dbs.uima.annotator.heideltime.DocumentType COLLOQUIAL = de.unihd.dbs.uima.annotator.heideltime.DocumentType.COLLOQUIAL;
+
+	/** Use {@link de.unihd.dbs.uima.annotator.heideltime.DocumentType#SCIENTIFIC} instead. */
+ @Deprecated
+ public static final de.unihd.dbs.uima.annotator.heideltime.DocumentType SCIENTIFIC = de.unihd.dbs.uima.annotator.heideltime.DocumentType.SCIENTIFIC;
+}
diff --git a/src/de/unihd/dbs/heideltime/standalone/HeidelTimeStandalone.java b/src/de/unihd/dbs/heideltime/standalone/HeidelTimeStandalone.java
index 79d7043f..b9947267 100644
--- a/src/de/unihd/dbs/heideltime/standalone/HeidelTimeStandalone.java
+++ b/src/de/unihd/dbs/heideltime/standalone/HeidelTimeStandalone.java
@@ -29,13 +29,13 @@
import java.util.Date;
import java.util.Locale;
import java.util.Properties;
-import java.util.logging.Level;
-import java.util.logging.Logger;
import org.apache.uima.UIMAFramework;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.XMLInputSource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import de.unihd.dbs.heideltime.standalone.components.JCasFactory;
import de.unihd.dbs.heideltime.standalone.components.ResultFormatter;
@@ -51,6 +51,7 @@
import de.unihd.dbs.heideltime.standalone.components.impl.UimaContextImpl;
import de.unihd.dbs.heideltime.standalone.components.impl.XMIResultFormatter;
import de.unihd.dbs.heideltime.standalone.exceptions.DocumentCreationTimeMissingException;
+import de.unihd.dbs.uima.annotator.heideltime.DocumentType;
import de.unihd.dbs.uima.annotator.heideltime.HeidelTime;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.resources.ResourceScanner;
@@ -65,6 +66,11 @@
*/
public class HeidelTimeStandalone {
+ /**
+ * Logging engine
+ */
+ private static final Logger LOG = LoggerFactory.getLogger(HeidelTimeStandalone.class);
+
/**
* Used document type
*/
@@ -100,12 +106,6 @@ public class HeidelTimeStandalone {
*/
private Boolean doIntervalTagging;
- /**
- * Logging engine
- */
- private static Logger logger = Logger.getLogger("HeidelTimeStandalone");
-
-
/**
* empty constructor.
*
@@ -215,7 +215,7 @@ public void initialize(Language language, DocumentType typeToProcess, OutputType
* @param doIntervalTagging Whether or not to invoke the IntervalTagger
*/
public void initialize(Language language, DocumentType typeToProcess, OutputType outputType, String configPath, POSTagger posTagger, Boolean doIntervalTagging) {
- logger.log(Level.INFO, "HeidelTimeStandalone initialized with language " + this.language.getName());
+ LOG.info("HeidelTimeStandalone initialized with language {}", language.getName());
// set the POS tagger
this.posTagger = posTagger;
@@ -234,14 +234,13 @@ public void initialize(Language language, DocumentType typeToProcess, OutputType
try {
heidelTime = new HeidelTime();
heidelTime.initialize(new UimaContextImpl(language, typeToProcess, CLISwitch.VERBOSITY2.getIsActive()));
- logger.log(Level.INFO, "HeidelTime initialized");
+ LOG.info("HeidelTime initialized");
} catch (Exception e) {
- e.printStackTrace();
- logger.log(Level.WARNING, "HeidelTime could not be initialized");
+ LOG.warn("HeidelTime could not be initialized", e);
}
// Initialize JCas factory -------------
- logger.log(Level.FINE, "Initializing JCas factory...");
+ LOG.debug("Initializing JCas factory...");
try {
TypeSystemDescription[] descriptions = new TypeSystemDescription[] {
UIMAFramework
@@ -253,10 +252,9 @@ public void initialize(Language language, DocumentType typeToProcess, OutputType
.getResource(
Config.get(Config.TYPESYSTEMHOME)))) };
jcasFactory = new JCasFactoryImpl(descriptions);
- logger.log(Level.INFO, "JCas factory initialized");
+ LOG.info("JCas factory initialized");
} catch (Exception e) {
- e.printStackTrace();
- logger.log(Level.WARNING, "JCas factory could not be initialized");
+ LOG.warn("JCas factory could not be initialized", e);
}
}
@@ -265,7 +263,7 @@ public void initialize(Language language, DocumentType typeToProcess, OutputType
* @param jcas jcas object
*/
private void runIntervalTagger(JCas jcas) {
- logger.log(Level.FINEST, "Running Interval Tagger...");
+ LOG.debug("Running Interval Tagger...");
Integer beforeAnnotations = jcas.getAnnotationIndex().size();
// Prepare the options for IntervalTagger's execution
@@ -280,8 +278,8 @@ private void runIntervalTagger(JCas jcas) {
iTagger.process(jcas);
// debug output
- Integer afterAnnotations = jcas.getAnnotationIndex().size();
- logger.log(Level.FINEST, "Annotation delta: " + (afterAnnotations - beforeAnnotations));
+ int afterAnnotations = jcas.getAnnotationIndex().size();
+ LOG.debug("Annotation delta: {}", afterAnnotations - beforeAnnotations);
}
/**
@@ -338,7 +336,7 @@ private void establishHeidelTimePreconditions(JCas jcas) {
* @param jcas
*/
private void establishPartOfSpeechInformation(JCas jcas) {
- logger.log(Level.FINEST, "Establishing part of speech information...");
+ LOG.debug("Establishing part of speech information...");
PartOfSpeechTagger partOfSpeechTagger = null;
Properties settings = new Properties();
@@ -346,7 +344,7 @@ private void establishPartOfSpeechInformation(JCas jcas) {
case ARABIC:
if(POSTagger.NO.equals(posTagger)) {
partOfSpeechTagger = new AllLanguagesTokenizerWrapper();
- logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Arabic. "
+ LOG.info("Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Arabic. "
+ "Thus, tagging results might be very different (and worse).");
} else {
partOfSpeechTagger = new StanfordPOSTaggerWrapper();
@@ -360,7 +358,7 @@ private void establishPartOfSpeechInformation(JCas jcas) {
case VIETNAMESE:
if(POSTagger.NO.equals(posTagger)) {
partOfSpeechTagger = new AllLanguagesTokenizerWrapper();
- logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Vietnamese. "
+ LOG.info("Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Vietnamese. "
+ "Thus, tagging results might be very different (and worse).");
} else {
partOfSpeechTagger = new JVnTextProWrapper();
@@ -375,7 +373,7 @@ private void establishPartOfSpeechInformation(JCas jcas) {
case CROATIAN:
if(POSTagger.NO.equals(posTagger)) {
partOfSpeechTagger = new AllLanguagesTokenizerWrapper();
- logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Croatian. "
+ LOG.info("Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for Croatian. "
+ "Thus, tagging results might be very different (and worse).");
} else {
partOfSpeechTagger = new HunPosTaggerWrapper();
@@ -411,18 +409,18 @@ private void establishPartOfSpeechInformation(JCas jcas) {
settings.put(PartOfSpeechTagger.HUNPOS_MODEL_PATH, Config.get(Config.HUNPOS_MODEL_PATH));
} else if(POSTagger.NO.equals(posTagger)) {
partOfSpeechTagger = new AllLanguagesTokenizerWrapper();
- logger.log(Level.INFO, "Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for the selected language. "
+ LOG.info("Be aware that you use the AllLanguagesTokenizer instead of specific preprocessing for the selected language. "
+ "If proper preprocessing for the specified language (" + language.getName() + ") is available, this might result in better "
+ "temporal tagging quality.");
} else {
- logger.log(Level.FINEST, "Sorry, but you can't use that tagger.");
+ LOG.warn("Sorry, but you can't use that tagger.");
}
}
partOfSpeechTagger.initialize(settings);
partOfSpeechTagger.process(jcas);
partOfSpeechTagger.reset();
- logger.log(Level.FINEST, "Part of speech information established");
+ LOG.trace("Part of speech information established");
}
private ResultFormatter getFormatter() {
@@ -495,33 +493,31 @@ public String process(String document, ResultFormatter resultFormatter)
*/
public String process(String document, Date documentCreationTime, ResultFormatter resultFormatter)
throws DocumentCreationTimeMissingException {
- logger.log(Level.INFO, "Processing started");
+ LOG.info("Processing started");
// Generate jcas object ----------
- logger.log(Level.FINE, "Generate CAS object");
+ LOG.debug("Generate CAS object");
JCas jcas = null;
try {
jcas = jcasFactory.createJCas();
jcas.setDocumentText(document);
- logger.log(Level.FINE, "CAS object generated");
+ LOG.debug("CAS object generated");
} catch (Exception e) {
- e.printStackTrace();
- logger.log(Level.WARNING, "Cas object could not be generated");
+ LOG.warn("Cas object could not be generated", e);
}
// Process jcas object -----------
try {
- logger.log(Level.FINER, "Establishing preconditions...");
+ LOG.trace("Establishing preconditions...");
provideDocumentCreationTime(jcas, documentCreationTime);
establishHeidelTimePreconditions(jcas);
- logger.log(Level.FINER, "Preconditions established");
+ LOG.trace("Preconditions established");
heidelTime.process(jcas);
- logger.log(Level.INFO, "Processing finished");
+ LOG.info("Processing finished");
} catch (Exception e) {
- e.printStackTrace();
- logger.log(Level.WARNING, "Processing aborted due to errors");
+ LOG.warn("Processing aborted due to errors", e);
}
// process interval tagging ---
@@ -529,15 +525,14 @@ public String process(String document, Date documentCreationTime, ResultFormatte
runIntervalTagger(jcas);
// Process results ---------------
- logger.log(Level.FINE, "Formatting result...");
+ LOG.debug("Formatting result...");
// PrintAnnotations.printAnnotations(jcas.getCas(), System.out);
String result = null;
try {
result = resultFormatter.format(jcas);
- logger.log(Level.INFO, "Result formatted");
+ LOG.info("Result formatted");
} catch (Exception e) {
- e.printStackTrace();
- logger.log(Level.WARNING, "Result could not be formatted");
+ LOG.warn("Result could not be formatted", e);
}
return result;
@@ -553,16 +548,16 @@ public static void main(String[] args) {
// get the relevant enum
CLISwitch sw = CLISwitch.getEnumFromSwitch(args[i]);
if(sw == null) { // unsupported CLI switch
- logger.log(Level.WARNING, "Unsupported switch: "+args[i]+". Quitting.");
- System.exit(-1);
+ LOG.warn("Unsupported switch: "+args[i]+". Quitting.");
+ System.exit(1);
}
if(sw.getHasFollowingValue()) { // handle values for switches
if(args.length > i+1 && !args[i+1].startsWith("-")) { // we still have an array index after this one and it's not a switch
sw.setValue(args[++i]);
} else { // value is missing or malformed
- logger.log(Level.WARNING, "Invalid or missing parameter after "+args[i]+". Quitting.");
- System.exit(-1);
+ LOG.warn("Invalid or missing parameter after "+args[i]+". Quitting.");
+ System.exit(1);
}
} else { // activate the value-less switches
sw.setValue(null);
@@ -582,43 +577,43 @@ public static void main(String[] args) {
// start off with the verbosity recognition -- lots of the other
// stuff can be skipped if this is set too high
if(CLISwitch.VERBOSITY2.getIsActive()) {
- logger.setLevel(Level.ALL);
- logger.log(Level.INFO, "Verbosity: '-vv'; Logging level set to ALL.");
+ // FIXME: not available in slf4j facade. LOG.setLevel(Level.ALL);
+ LOG.info("Verbosity: '-vv'; Logging level set to ALL.");
// output the found language resource folders
String languagesList = "";
for(String language : ResourceScanner.getInstance().getDetectedResourceFolders()) {
languagesList += System.getProperty("line.separator") + "- " + language;
}
- logger.log(Level.INFO, "Listing detected language folders:" + languagesList);
+ LOG.info("Listing detected language folders:" + languagesList);
} else if(CLISwitch.VERBOSITY.getIsActive()) {
- logger.setLevel(Level.INFO);
- logger.log(Level.INFO, "Verbosity: '-v'; Logging level set to INFO and above.");
+ // FIXME: not available in slf4j facade. LOG.setLevel(Level.INFO);
+ LOG.info("Verbosity: '-v'; Logging level set to INFO and above.");
} else {
- logger.setLevel(Level.WARNING);
- logger.log(Level.INFO, "Verbosity -v/-vv NOT FOUND OR RECOGNIZED; Logging level set to WARNING and above.");
+ // FIXME: not available in slf4j facade. LOG.setLevel(Level.WARNING);
+ LOG.info("Verbosity -v/-vv NOT FOUND OR RECOGNIZED; Logging level set to WARNING and above.");
}
// Check input encoding
String encodingType = null;
if(CLISwitch.ENCODING.getIsActive()) {
encodingType = CLISwitch.ENCODING.getValue().toString();
- logger.log(Level.INFO, "Encoding '-e': "+encodingType);
+ LOG.info("Encoding '-e': "+encodingType);
} else {
// Encoding type not found
encodingType = CLISwitch.ENCODING.getValue().toString();
- logger.log(Level.INFO, "Encoding '-e': NOT FOUND OR RECOGNIZED; set to 'UTF-8'");
+ LOG.info("Encoding '-e': NOT FOUND OR RECOGNIZED; set to 'UTF-8'");
}
// Check output format
OutputType outputType = null;
if(CLISwitch.OUTPUTTYPE.getIsActive()) {
outputType = OutputType.valueOf(CLISwitch.OUTPUTTYPE.getValue().toString().toUpperCase());
- logger.log(Level.INFO, "Output '-o': "+outputType.toString().toUpperCase());
+ LOG.info("Output '-o': "+outputType.toString().toUpperCase());
} else {
// Output type not found
outputType = (OutputType) CLISwitch.OUTPUTTYPE.getValue();
- logger.log(Level.INFO, "Output '-o': NOT FOUND OR RECOGNIZED; set to "+outputType.toString().toUpperCase());
+ LOG.info("Output '-o': NOT FOUND OR RECOGNIZED; set to "+outputType.toString().toUpperCase());
}
// Check language
@@ -627,16 +622,16 @@ public static void main(String[] args) {
language = Language.getLanguageFromString((String) CLISwitch.LANGUAGE.getValue());
if(language == Language.WILDCARD && !ResourceScanner.getInstance().getDetectedResourceFolders().contains(language.getName())) {
- logger.log(Level.SEVERE, "Language '-l': "+CLISwitch.LANGUAGE.getValue()+" NOT RECOGNIZED; aborting.");
+ LOG.error("Language '-l': {} NOT RECOGNIZED; aborting.", CLISwitch.LANGUAGE.getValue());
printHelp();
- System.exit(-1);
+ System.exit(1);
} else {
- logger.log(Level.INFO, "Language '-l': "+language.getName());
+ LOG.info("Language '-l': "+language.getName());
}
} else {
// Language not found
language = Language.getLanguageFromString((String) CLISwitch.LANGUAGE.getValue());
- logger.log(Level.INFO, "Language '-l': NOT FOUND; set to "+language.toString().toUpperCase());
+ LOG.info("Language '-l': NOT FOUND; set to {}", language.toString().toUpperCase());
}
// Check type
@@ -648,14 +643,14 @@ public static void main(String[] args) {
}
type = DocumentType.valueOf(CLISwitch.DOCTYPE.getValue().toString().toUpperCase());
} catch(IllegalArgumentException e) {
- logger.log(Level.WARNING, "Type '-t': NOT RECOGNIZED. These are the available options: " + Arrays.asList(DocumentType.values()));
- System.exit(-1);
+ LOG.warn("Type '-t': NOT RECOGNIZED. These are the available options: {}", Arrays.asList(DocumentType.values()));
+ System.exit(1);
}
- logger.log(Level.INFO, "Type '-t': "+type.toString().toUpperCase());
+ LOG.info("Type '-t': "+type.toString().toUpperCase());
} else {
// Type not found
type = (DocumentType) CLISwitch.DOCTYPE.getValue();
- logger.log(Level.INFO, "Type '-t': NOT FOUND; set to "+type.toString().toUpperCase());
+ LOG.info("Type '-t': NOT FOUND; set to {}", type.toString().toUpperCase());
}
// Check document creation time
@@ -664,21 +659,20 @@ public static void main(String[] args) {
try {
DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
dct = formatter.parse(CLISwitch.DCT.getValue().toString());
- logger.log(Level.INFO, "Document Creation Time '-dct': "+dct.toString());
+ LOG.info("Document Creation Time '-dct': {}", dct.toString());
} catch (Exception e) {
// DCT was not parseable
- logger.log(Level.WARNING, "Document Creation Time '-dct': NOT RECOGNIZED. Quitting.");
+ LOG.warn("Document Creation Time '-dct': NOT RECOGNIZED. Quitting.");
printHelp();
- System.exit(-1);
+ System.exit(1);
}
} else {
if ((type == DocumentType.NEWS) || (type == DocumentType.COLLOQUIAL)) {
// Dct needed
dct = (Date) CLISwitch.DCT.getValue();
- logger.log(Level.INFO, "Document Creation Time '-dct': NOT FOUND; set to local date ("
- + dct.toString() + ").");
+ LOG.info("Document Creation Time '-dct': NOT FOUND; set to local date ({}).", dct.toString());
} else {
- logger.log(Level.INFO, "Document Creation Time '-dct': NOT FOUND; skipping.");
+ LOG.info("Document Creation Time '-dct': NOT FOUND; skipping.");
}
}
@@ -694,32 +688,31 @@ public static void main(String[] args) {
try {
Locale.setDefault(myLocale); // try to set the locale
- logger.log(Level.INFO, "Locale '-locale': "+myLocale.toString());
+ LOG.info("Locale '-locale': "+myLocale.toString());
} catch(Exception e) { // if the above fails, spit out error message and available locales
- logger.log(Level.WARNING, "Supplied locale parameter couldn't be resolved to a working locale. Try one of these:");
- logger.log(Level.WARNING, Arrays.asList(Locale.getAvailableLocales()).toString()); // list available locales
+ LOG.warn("Supplied locale parameter couldn't be resolved to a working locale. Try one of these:");
+ LOG.warn(Arrays.asList(Locale.getAvailableLocales()).toString()); // list available locales
printHelp();
- System.exit(-1);
+ System.exit(1);
}
} else {
// no -locale parameter supplied: just show default locale
- logger.log(Level.INFO, "Locale '-locale': NOT FOUND, set to environment locale: "+Locale.getDefault().toString());
+ LOG.info("Locale '-locale': NOT FOUND, set to environment locale: {}", Locale.getDefault().toString());
}
// Read configuration from file
String configPath = CLISwitch.CONFIGFILE.getValue().toString();
try {
- logger.log(Level.INFO, "Configuration path '-c': "+configPath);
+ LOG.info("Configuration path '-c': "+configPath);
readConfigFile(configPath);
- logger.log(Level.FINE, "Config initialized");
+ LOG.debug("Config initialized");
} catch (Exception e) {
- e.printStackTrace();
- logger.log(Level.WARNING, "Config could not be initialized! Please supply the -c switch or "
- + "put a config.props into this directory.");
+ LOG.warn("Config could not be initialized! Please supply the -c switch or "
+ + "put a config.props into this directory.", e);
printHelp();
- System.exit(-1);
+ System.exit(1);
}
// Set the preprocessing POS tagger
@@ -728,31 +721,31 @@ public static void main(String[] args) {
try {
posTagger = POSTagger.valueOf(CLISwitch.POSTAGGER.getValue().toString().toUpperCase());
} catch(IllegalArgumentException e) {
- logger.log(Level.WARNING, "Given POS Tagger doesn't exist. Please specify a valid one as listed in the help.");
+ LOG.warn("Given POS Tagger doesn't exist. Please specify a valid one as listed in the help.");
printHelp();
- System.exit(-1);
+ System.exit(1);
}
- logger.log(Level.INFO, "POS Tagger '-pos': "+posTagger.toString().toUpperCase());
+ LOG.info("POS Tagger '-pos': {}", posTagger.toString().toUpperCase());
} else {
// Type not found
posTagger = (POSTagger) CLISwitch.POSTAGGER.getValue();
- logger.log(Level.INFO, "POS Tagger '-pos': NOT FOUND OR RECOGNIZED; set to "+posTagger.toString().toUpperCase());
+ LOG.info("POS Tagger '-pos': NOT FOUND OR RECOGNIZED; set to {}", posTagger.toString().toUpperCase());
}
// Set whether or not to use the Interval Tagger
Boolean doIntervalTagging = false;
if(CLISwitch.INTERVALS.getIsActive()) {
doIntervalTagging = CLISwitch.INTERVALS.getIsActive();
- logger.log(Level.INFO, "Interval Tagger '-it': " + doIntervalTagging.toString());
+ LOG.info("Interval Tagger '-it': {}", doIntervalTagging.toString());
} else {
- logger.log(Level.INFO, "Interval Tagger '-it': NOT FOUND OR RECOGNIZED; set to " + doIntervalTagging.toString());
+ LOG.info("Interval Tagger '-it': NOT FOUND OR RECOGNIZED; set to {}", doIntervalTagging.toString());
}
// make sure we have a document path
if (docPath == null) {
- logger.log(Level.WARNING, "No input file given; aborting.");
+ LOG.warn("No input file given; aborting.");
printHelp();
- System.exit(-1);
+ System.exit(1);
}
@@ -763,7 +756,7 @@ public static void main(String[] args) {
FileChannel inChannel = null;
PrintWriter pwOut = null;
try {
- logger.log(Level.INFO, "Reading document using charset: " + encodingType);
+ LOG.info("Reading document using charset: " + encodingType);
aFile = new RandomAccessFile(docPath, "r");
inChannel = aFile.getChannel();
@@ -785,7 +778,7 @@ public static void main(String[] args) {
pwOut = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
pwOut.println(out);
} catch (Exception e) {
- e.printStackTrace();
+ LOG.warn(e.getMessage(), e);
} finally {
if(pwOut != null) {
pwOut.close();
@@ -809,7 +802,7 @@ public static void main(String[] args) {
public static void readConfigFile(String configPath) {
InputStream configStream = null;
try {
- logger.log(Level.INFO, "trying to read in file "+configPath);
+ LOG.info("trying to read in file "+configPath);
configStream = new FileInputStream(configPath);
Properties props = new Properties();
@@ -819,11 +812,10 @@ public static void readConfigFile(String configPath) {
configStream.close();
} catch (FileNotFoundException e) {
- logger.log(Level.WARNING, "couldn't open configuration file \""+configPath+"\". quitting.");
- System.exit(-1);
+ LOG.error("couldn't open configuration file \"{}\". quitting.", configPath);
+ throw new RuntimeException("Cannot read HeidelTime configuration.", e);
} catch (IOException e) {
- logger.log(Level.WARNING, "couldn't close config file handle");
- e.printStackTrace();
+ LOG.warn("couldn't close config file handle", e);
}
}
diff --git a/src/de/unihd/dbs/heideltime/standalone/components/impl/StandaloneConfigContext.java b/src/de/unihd/dbs/heideltime/standalone/components/impl/StandaloneConfigContext.java
index e6edaaa2..a83933fa 100644
--- a/src/de/unihd/dbs/heideltime/standalone/components/impl/StandaloneConfigContext.java
+++ b/src/de/unihd/dbs/heideltime/standalone/components/impl/StandaloneConfigContext.java
@@ -1,172 +1,29 @@
package de.unihd.dbs.heideltime.standalone.components.impl;
-import java.io.InputStream;
-import java.net.URI;
-import java.net.URL;
-import java.util.HashMap;
+import org.apache.uima.impl.RootUimaContext_impl;
+import org.apache.uima.resource.ConfigurationManager;
+import org.apache.uima.resource.impl.ConfigurationManager_impl;
+import org.apache.uima.resource.impl.ResourceManager_impl;
-import org.apache.uima.UimaContext;
-import org.apache.uima.cas.AbstractCas;
-import org.apache.uima.cas.SofaID;
-import org.apache.uima.resource.ResourceAccessException;
-import org.apache.uima.resource.Session;
-import org.apache.uima.util.InstrumentationFacility;
-import org.apache.uima.util.Logger;
+/**
+ * UIMA context with manually set configuration manager.
+ */
+public class StandaloneConfigContext extends RootUimaContext_impl {
+ private ConfigurationManager mConfigManager;
-@SuppressWarnings("deprecation")
-public class StandaloneConfigContext implements UimaContext {
- private HashMap settings = new HashMap();
-
- @Override
- public Object getConfigParameterValue(String aParamName) {
- return settings.get(aParamName);
+ public StandaloneConfigContext() {
+ super();
+ mConfigManager = new ConfigurationManager_impl();
+ this.initializeRoot(null, new ResourceManager_impl(), mConfigManager);
+ mConfigManager.setSession(this.getSession());
}
- public void setConfigParameterValue(String aParamName, Object aParamValue) {
- settings.put(aParamName, aParamValue);
- }
-
- @Override
- public Object getConfigParameterValue(String aGroupName, String aParamName) {
- return settings.get(aParamName);
+ public void setConfigParameterValue(String key, Object val) {
+ mConfigManager.setConfigParameterValue(makeQualifiedName(key), val);
}
- /*
- * leave these defunct because we don't use them for now
- */
-
- @Override
- public String[] getConfigurationGroupNames() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public String[] getConfigParameterNames(String aGroup) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public String[] getConfigParameterNames() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Logger getLogger() {
- // TODO Auto-generated method stub
- return null;
- }
-
@Override
- public InstrumentationFacility getInstrumentationFacility() {
- // TODO Auto-generated method stub
- return null;
+ public ConfigurationManager getConfigurationManager() {
+ return mConfigManager;
}
-
- @Override
- public URL getResourceURL(String aKey) throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public URI getResourceURI(String aKey) throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public String getResourceFilePath(String aKey)
- throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public InputStream getResourceAsStream(String aKey)
- throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Object getResourceObject(String aKey) throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public URL getResourceURL(String aKey, String[] aParams)
- throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public URI getResourceURI(String aKey, String[] aParams)
- throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public String getResourceFilePath(String aKey, String[] aParams)
- throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public InputStream getResourceAsStream(String aKey, String[] aParams)
- throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Object getResourceObject(String aKey, String[] aParams)
- throws ResourceAccessException {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public String getDataPath() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Session getSession() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public SofaID mapToSofaID(String aSofaName) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public String mapSofaIDToComponentSofaName(String aSofaID) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public SofaID[] getSofaMappings() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- @SuppressWarnings("rawtypes")
- public AbstractCas getEmptyCas(Class aCasInterface) {
- // TODO Auto-generated method stub
- return null;
- }
-
}
diff --git a/src/de/unihd/dbs/heideltime/standalone/components/impl/UimaContextImpl.java b/src/de/unihd/dbs/heideltime/standalone/components/impl/UimaContextImpl.java
index 1cd77f5c..72c4c947 100644
--- a/src/de/unihd/dbs/heideltime/standalone/components/impl/UimaContextImpl.java
+++ b/src/de/unihd/dbs/heideltime/standalone/components/impl/UimaContextImpl.java
@@ -20,7 +20,7 @@
import org.apache.uima.resource.impl.ResourceManager_impl;
import de.unihd.dbs.heideltime.standalone.Config;
-import de.unihd.dbs.heideltime.standalone.DocumentType;
+import de.unihd.dbs.uima.annotator.heideltime.DocumentType;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
/**
diff --git a/src/de/unihd/dbs/heideltime/standalone/exceptions/DocumentCreationTimeMissingException.java b/src/de/unihd/dbs/heideltime/standalone/exceptions/DocumentCreationTimeMissingException.java
index 8e2b3449..0230cadd 100644
--- a/src/de/unihd/dbs/heideltime/standalone/exceptions/DocumentCreationTimeMissingException.java
+++ b/src/de/unihd/dbs/heideltime/standalone/exceptions/DocumentCreationTimeMissingException.java
@@ -10,12 +10,10 @@
*
* HeidelTime is a multilingual, cross-domain temporal tagger.
* For details, see http://dbs.ifi.uni-heidelberg.de/heideltime
- */
+ */
package de.unihd.dbs.heideltime.standalone.exceptions;
-import de.unihd.dbs.heideltime.standalone.DocumentType;
-
/**
* Exception thrown if document creation time is missing while processing a document of type {@link DocumentType#NEWS}
*
@@ -23,10 +21,8 @@
* @version 1.0
*/
public class DocumentCreationTimeMissingException extends Exception {
-
/**
*
*/
private static final long serialVersionUID = -157033697488394828L;
-
}
diff --git a/src/de/unihd/dbs/uima/annotator/alllanguagestokenizer/AllLanguagesTokenizer.java b/src/de/unihd/dbs/uima/annotator/alllanguagestokenizer/AllLanguagesTokenizer.java
index e312c2c8..5ddbf61e 100644
--- a/src/de/unihd/dbs/uima/annotator/alllanguagestokenizer/AllLanguagesTokenizer.java
+++ b/src/de/unihd/dbs/uima/annotator/alllanguagestokenizer/AllLanguagesTokenizer.java
@@ -8,6 +8,7 @@
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import de.unihd.dbs.uima.types.heideltime.Sentence;
@@ -60,135 +61,127 @@ public List tokenize(JCas jcas) {
if(line.matches("^<.*>$")) {
// SGML tag
outBuf.append(line + "\n");
- } else {
- // add a blank at the beginning and the end of each segment
- line = " " + line + " ";
-
- // insert missing blanks after punctuation
- line = line.replaceAll("\\.\\.\\.", " ... ");
- line = line.replaceAll("([;\\!\\?])([^ ])", "$1 $2");
- line = line.replaceAll("([.,:])([^ 0-9.])", "$1 $2");
-
- String[] lines = line.split(" ");
-
- for(String token : lines) {
- // remove some whitespaces that \s doesn't catch
- if(token.equals(""))
- continue;
-
- String suffix = "";
-
- // separate punctuation and parentheses from words
- Boolean finished = false;
- Matcher m;
- do {
- finished = true;
-
- // cut off preceding punctuation
- m = Pattern.compile("^([" + PChar + "])(.)").matcher(token);
- if(m.find()) {
- token = token.replaceAll("^([" + PChar + "])(.)", "$2");
- outBuf.append(m.group(1) + "\n");
- finished = false;
- }
-
- // cut off trailing punctuation
- m = Pattern.compile("(.)([" + FChar + "])$").matcher(token);
- if(m.find()) {
- token = token.replaceAll("(.)([" + FChar + "])$", "$1");
- suffix = m.group(2) + "\n" + suffix;
- finished = false;
- }
-
- // cut off trailing periods if punctuation precedes
- m = Pattern.compile("([" + FChar + "])\\.$").matcher(token);
- if(m.find()) {
- token = token.replaceAll("([" + FChar + "])\\.$", "");
- suffix = ".\n" + suffix;
-
- if(token.equals("")) {
- token = m.group(1);
- } else {
- suffix = m.group(1) + "\n" + suffix;
- }
-
- finished = false;
+ continue;
+ }
+ // add a blank at the beginning and the end of each segment
+ line = " " + line + " ";
+
+ // insert missing blanks after punctuation
+ line = line.replaceAll("\\.\\.\\.", " ... ");
+ line = line.replaceAll("([;\\!\\?])([^ ])", "$1 $2");
+ line = line.replaceAll("([.,:])([^ 0-9.])", "$1 $2");
+
+ String[] lines = line.split(" ");
+
+ for(String token : lines) {
+ // remove some whitespaces that \s doesn't catch
+ if(token.equals(""))
+ continue;
+
+ String suffix = "";
+
+ // separate punctuation and parentheses from words
+ boolean finished = false;
+ Matcher m;
+ do {
+ finished = true;
+
+ // cut off preceding punctuation
+ m = Pattern.compile("^([" + PChar + "])(.)").matcher(token);
+ if(m.find()) {
+ token = token.replaceAll("^([" + PChar + "])(.)", "$2");
+ outBuf.append(m.group(1) + "\n");
+ finished = false;
+ }
+
+ // cut off trailing punctuation
+ m = Pattern.compile("(.)([" + FChar + "])$").matcher(token);
+ if(m.find()) {
+ token = token.replaceAll("(.)([" + FChar + "])$", "$1");
+ suffix = m.group(2) + "\n" + suffix;
+ finished = false;
+ }
+
+ // cut off trailing periods if punctuation precedes
+ m = Pattern.compile("([" + FChar + "])\\.$").matcher(token);
+ if(m.find()) {
+ token = token.replaceAll("([" + FChar + "])\\.$", "");
+ suffix = ".\n" + suffix;
+
+ if(token.equals("")) {
+ token = m.group(1);
+ } else {
+ suffix = m.group(1) + "\n" + suffix;
}
- } while(!finished);
- /* TODO:commented out because those are language-specific
+
+ finished = false;
+ }
+ } while(!finished);
+ /* TODO:commented out because those are language-specific
// handle explicitly listed tokens
if(abbreviations.contains(token)) {
outBuf.append(token + "\n" + suffix);
continue;
}*/
-
- // abbreviations of the form A. or U.S.A.
- if(token.matches("^([A-Za-z-]\\.)+$")) {
- outBuf.append(token + "\n" + suffix);
- continue;
- }
-
- // disambiguate periods
- m = Pattern.compile("^(..*)\\.$").matcher(token);
- if(m.matches() && !line.equals("...")
- /* TODO:commented out because those are language-specific: && !(flags.contains(Flag.GALICIAN) && token.matches("^[0-9]+\\.$"))*/) {
- token = m.group(1);
- suffix = ".\n" + suffix;
- /* TODO:commented out because those are language-specific
+
+ // abbreviations of the form A. or U.S.A.
+ if(token.matches("^([A-Za-z-]\\.)+$")) {
+ outBuf.append(token + "\n" + suffix);
+ continue;
+ }
+
+ // disambiguate periods
+ m = Pattern.compile("^(..*)\\.$").matcher(token);
+ if(m.matches() && !line.equals("...")
+ /* TODO:commented out because those are language-specific: && !(flags.contains(Flag.GALICIAN) && token.matches("^[0-9]+\\.$"))*/) {
+ token = m.group(1);
+ suffix = ".\n" + suffix;
+ /* TODO:commented out because those are language-specific
if(abbreviations.contains(token)) {
outBuf.append(token + "\n" + suffix);
continue;
}*/
- }
-
- // cut off clitics
+ }
+
+ // cut off clitics
+ while(true) {
+ m = Pattern.compile("^(--)(.)").matcher(token);
+ if(!m.find())
+ break;
+
+ token = token.replaceAll("^(--)(.)", "$2");
+ outBuf.append(m.group(1) + "\n");
+ }
+ if(!PClitic.equals("")) {
while(true) {
- m = Pattern.compile("^(--)(.)").matcher(token);
-
- if(!m.find()) {
+ m = Pattern.compile("^(" + PClitic + ")(.)").matcher(token);
+ if(!m.find())
break;
- }
-
- token = token.replaceAll("^(--)(.)", "$2");
+
+ token = token.replaceAll("^(" + PClitic + ")(.)", "$2");
outBuf.append(m.group(1) + "\n");
}
- if(!PClitic.equals("")) {
- while(true) {
- m = Pattern.compile("^(" + PClitic + ")(.)").matcher(token);
-
- if(!m.find()) {
- break;
- }
-
- token = token.replaceAll("^(" + PClitic + ")(.)", "$2");
- outBuf.append(m.group(1) + "\n");
- }
- }
-
+ }
+
+ while(true) {
+ m = Pattern.compile("^(--)(.)").matcher(token);
+ if(!m.find())
+ break;
+
+ token = token.replaceAll("^(--)(.)", "$1");
+ suffix = m.group(2) + "\n" + suffix;
+ }
+ if(!FClitic.equals("")) {
while(true) {
- m = Pattern.compile("^(--)(.)").matcher(token);
-
- if(!m.find()) {
+ m = Pattern.compile("(.)(" + FClitic + ")$").matcher(token);
+ if(!m.find())
break;
- }
-
- token = token.replaceAll("^(--)(.)", "$1");
+
+ token = token.replaceAll("(.)(" + FClitic + ")$", "$1");
suffix = m.group(2) + "\n" + suffix;
}
- if(!FClitic.equals("")) {
- while(true) {
- m = Pattern.compile("(.)(" + FClitic + ")$").matcher(token);
-
- if(!m.find()) {
- break;
- }
-
- token = token.replaceAll("(.)(" + FClitic + ")$", "$1");
- suffix = m.group(2) + "\n" + suffix;
- }
- }
- outBuf.append(token + "\n" + suffix);
}
+ outBuf.append(token + "\n" + suffix);
}
}
}
@@ -196,11 +189,11 @@ public List tokenize(JCas jcas) {
// find the tokens in the original text and create token annotations
LinkedList outList = new LinkedList();
String origText = jcas.getDocumentText();
- Integer origTextOffset = 0;
+ int origTextOffset = 0;
for(String s : outBuf.toString().split("\n")) {
- Integer begin = origText.indexOf(s, origTextOffset);
- Integer end = begin + s.length();
+ int begin = origText.indexOf(s, origTextOffset);
+ int end = begin + s.length();
Token t = new Token(jcas);
t.setBegin(begin);
@@ -219,17 +212,17 @@ public List tokenize(JCas jcas) {
public List sentenceTokenize(JCas jcas) {
List outList = new LinkedList();
- FSIterator tokIt = jcas.getAnnotationIndex(Token.type).iterator();
+ AnnotationIndex tokens = jcas.getAnnotationIndex(Token.type);
+ FSIterator tokIt = tokens.iterator();
Sentence s = new Sentence(jcas);
- Boolean sentenceStarted = false;
+ boolean sentenceStarted = false;
Token tOld = null;
Token t = null;
while(tokIt.hasNext()) {
- if (!(t == null)){
+ if (t != null)
tOld = t;
- }
- t = (Token) tokIt.next();
+ t = tokIt.next();
// set sentence beginning
if(sentenceStarted == false) {
@@ -243,8 +236,7 @@ public List sentenceTokenize(JCas jcas) {
*/
if(!tokIt.hasNext() ||
(t.getCoveredText().matches("[.:!\\?]+") &&
- (!((tOld != null && tOld.getCoveredText().matches("[\\d]+")) ||
- ((jcas.getDocumentText().substring(t.getEnd()).length() > 2) && (jcas.getDocumentText().substring(t.getEnd(),t.getEnd()+3)).matches(" [A-Z][.-]")))))){
+ !((tOld != null && tOld.getCoveredText().matches("[\\d]+")) || (jcas.getDocumentText().substring(t.getEnd()).length() > 2 && jcas.getDocumentText().substring(t.getEnd(),t.getEnd()+3).matches(" [A-Z][.-]"))))){
// ((!(tOld.getCoveredText().matches("[\\d]+")))) && (!((jcas.getDocumentText().substring(t.getEnd())).matches("^[\\s]*"))))) {
// (t.getCoveredText().matches("[.:!\\?]+") && (!(tOld.getCoveredText().matches("[\\d]+"))))) { // das funktioniert ok
sentenceStarted = false;
@@ -252,7 +244,7 @@ public List sentenceTokenize(JCas jcas) {
// check for whether the punctuation mark is followed by a closing quotation mark
if(tokIt.hasNext()) {
- Token tNext = (Token) tokIt.next();
+ Token tNext = tokIt.next();
if(tNext.getCoveredText().matches("[»’'\"‛”‟›〞』」﹄"'」﹂]+")) {
s.setEnd(tNext.getEnd());
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/DocumentType.java b/src/de/unihd/dbs/uima/annotator/heideltime/DocumentType.java
new file mode 100644
index 00000000..e3a5bc3b
--- /dev/null
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/DocumentType.java
@@ -0,0 +1,34 @@
+package de.unihd.dbs.uima.annotator.heideltime;
+
+/**
+ * Heideltime document types.
+ */
+public enum DocumentType {
+ COLLOQUIAL("colloquial"), NEWS("news"), NARRATIVE("narrative"), SCIENTIFIC("scientific");
+ String name;
+
+ DocumentType(String name) {
+ this.name = name;
+ }
+
+ @Override
+ public String toString() {
+ return name;
+ }
+
+ public static DocumentType of(String s) {
+ switch (s) {
+ case "colloquial":
+ return COLLOQUIAL;
+ case "news":
+ return NEWS;
+ case "narrative":
+ case "narratives":
+ return NARRATIVE;
+ case "scientific":
+ return SCIENTIFIC;
+ default:
+ throw new IllegalArgumentException("Unknown document type: " + s);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/HeidelTime.java b/src/de/unihd/dbs/uima/annotator/heideltime/HeidelTime.java
index e94f96b2..c9055343 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/HeidelTime.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/HeidelTime.java
@@ -14,92 +14,97 @@
package de.unihd.dbs.uima.annotator.heideltime;
+import static de.unihd.dbs.uima.annotator.heideltime.utilities.ParseInteger.parseInt;
+import static de.unihd.dbs.uima.annotator.heideltime.utilities.ParseInteger.parseIntAt;
+
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.MatchResult;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.AnalysisComponent;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import de.unihd.dbs.uima.annotator.heideltime.ProcessorManager.Priority;
import de.unihd.dbs.uima.annotator.heideltime.processors.TemponymPostprocessing;
import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
import de.unihd.dbs.uima.annotator.heideltime.resources.NormalizationManager;
import de.unihd.dbs.uima.annotator.heideltime.resources.RePatternManager;
-import de.unihd.dbs.uima.annotator.heideltime.resources.RegexHashMap;
+import de.unihd.dbs.uima.annotator.heideltime.resources.Rule;
+import de.unihd.dbs.uima.annotator.heideltime.resources.RuleExpansion;
import de.unihd.dbs.uima.annotator.heideltime.resources.RuleManager;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.DateCalculator;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.ContextAnalyzer;
+import de.unihd.dbs.uima.annotator.heideltime.utilities.DurationSimplification;
import de.unihd.dbs.uima.annotator.heideltime.utilities.LocaleException;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox;
-import de.unihd.dbs.uima.types.heideltime.Dct;
+import de.unihd.dbs.uima.annotator.heideltime.utilities.TokenBoundaryMatcher;
import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Timex3;
import de.unihd.dbs.uima.types.heideltime.Token;
-
/**
- * HeidelTime finds temporal expressions and normalizes them according to the TIMEX3
- * TimeML annotation standard.
+ * HeidelTime finds temporal expressions and normalizes them according to the TIMEX3 TimeML annotation standard.
*
* @author jannik stroetgen
*
*/
public class HeidelTime extends JCasAnnotator_ImplBase {
+ /** Class logger */
+ private static final Logger LOG = LoggerFactory.getLogger(HeidelTime.class);
- // TOOL NAME (may be used as componentId)
- private Class> component = this.getClass();
-
// PROCESSOR MANAGER
private ProcessorManager procMan = new ProcessorManager();
// COUNTER (how many timexes added to CAS? (finally)
- public int timex_counter = 0;
+ public int timex_counter = 0;
public int timex_counter_global = 0;
-
- // FLAG (for historic expressions referring to BC)
- public Boolean flagHistoricDates = false;
-
+
// COUNTER FOR TIMEX IDS
private int timexID = 0;
-
+
// INPUT PARAMETER HANDLING WITH UIMA
- private String PARAM_LANGUAGE = "Language";
- // supported languages (2012-05-19): english, german, dutch, englishcoll, englishsci
- private String PARAM_TYPE_TO_PROCESS = "Type";
+ private String PARAM_LANGUAGE = "Language";
+ // supported languages (2012-05-19): english, german, dutch, englishcoll,
+ // englishsci
+ private String PARAM_TYPE_TO_PROCESS = "Type";
// chosen locale parameter name
- private String PARAM_LOCALE = "locale";
- // supported types (2012-05-19): news (english, german, dutch), narrative (english, german, dutch), colloquial
- private Language language = Language.ENGLISH;
- private String typeToProcess = "news";
-
+ private String PARAM_LOCALE = "locale";
+ // supported types (2012-05-19): news (english, german, dutch), narrative
+ // (english, german, dutch), colloquial
+ private Language language = Language.ENGLISH;
+ private DocumentType typeToProcess = DocumentType.NEWS;
+
// INPUT PARAMETER HANDLING WITH UIMA (which types shall be extracted)
- private String PARAM_DATE = "Date";
- private String PARAM_TIME = "Time";
- private String PARAM_DURATION = "Duration";
- private String PARAM_SET = "Set";
+ private String PARAM_DATE = "Date";
+ private String PARAM_TIME = "Time";
+ private String PARAM_DURATION = "Duration";
+ private String PARAM_SET = "Set";
private String PARAM_TEMPONYMS = "Temponym";
- private String PARAM_DEBUG = "Debugging";
- private String PARAM_GROUP = "ConvertDurations";
- private Boolean find_dates = true;
- private Boolean find_times = true;
- private Boolean find_durations = true;
- private Boolean find_sets = true;
- private Boolean find_temponyms = false;
- private Boolean group_gran = true;
+ private String PARAM_GROUP = "ConvertDurations";
+ private boolean find_dates = true;
+ private boolean find_times = true;
+ private boolean find_durations = true;
+ private boolean find_sets = true;
+ private boolean find_temponyms = false;
+ private boolean group_gran = true;
// FOR DEBUGGING PURPOSES (IF FALSE)
- private Boolean deleteOverlapped = true;
+ private boolean deleteOverlapping = true;
+
+ // Whether to generate "allTokIds" strings.
+ // Required for TempEval!
+ private boolean doAllTokIds = true;
+ private ResolveAmbiguousValues resolver;
/**
* @see AnalysisComponent#initialize(UimaContext)
@@ -108,222 +113,204 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept
super.initialize(aContext);
/////////////////////////////////
- // DEBUGGING PARAMETER SETTING //
- /////////////////////////////////
- this.deleteOverlapped = true;
- Boolean doDebug = (Boolean) aContext.getConfigParameterValue(PARAM_DEBUG);
- Logger.setPrintDetails(doDebug == null ? false : doDebug);
-
- /////////////////////////////////
- // HANDLE LOCALE //
+ // HANDLE LOCALE //
/////////////////////////////////
String requestedLocale = (String) aContext.getConfigParameterValue(PARAM_LOCALE);
- if(requestedLocale == null || requestedLocale.length() == 0) { // if the PARAM_LOCALE setting was left empty,
- Locale.setDefault(Locale.UK); // use a default, the ISO8601-adhering UK locale (equivalent to "en_GB")
- } else { // otherwise, check if the desired locale exists in the JVM's available locale repertoire
+ if (requestedLocale == null || requestedLocale.length() == 0) {
+ // if the PARAM_LOCALE setting was left empty,
+ Locale.setDefault(Locale.UK);
+ // use the ISO8601-adhering UK locale (equivalent to "en_GB")
+ } else { // otherwise, check if the desired locale exists in the JVM's
+ // available locale repertoire
try {
- Locale locale = DateCalculator.getLocaleFromString(requestedLocale);
- Locale.setDefault(locale); // sets it for the entire JVM session
+ Locale locale = getLocaleFromString(requestedLocale);
+ Locale.setDefault(locale); // sets it for the entire JVM
+ // session
} catch (LocaleException e) {
- Logger.printError("Supplied locale parameter couldn't be resolved to a working locale. Try one of these:");
- String localesString = new String();
- for(Locale l : Locale.getAvailableLocales()) { // list all available locales
- localesString += l.toString()+" ";
+ StringBuilder localesString = new StringBuilder();
+ localesString.append("Supplied locale parameter couldn't be resolved to a working locale. Try one of these:");
+ for (Locale l : Locale.getAvailableLocales()) {
+ // list all available locales
+ localesString.append(l.toString()).append(' ');
}
- Logger.printError(localesString);
- System.exit(-1);
+ LOG.error(localesString.toString());
+ System.exit(1);
}
}
-
+
//////////////////////////////////
// GET CONFIGURATION PARAMETERS //
//////////////////////////////////
language = Language.getLanguageFromString((String) aContext.getConfigParameterValue(PARAM_LANGUAGE));
-
- typeToProcess = (String) aContext.getConfigParameterValue(PARAM_TYPE_TO_PROCESS);
- find_dates = (Boolean) aContext.getConfigParameterValue(PARAM_DATE);
- find_times = (Boolean) aContext.getConfigParameterValue(PARAM_TIME);
+
+ typeToProcess = DocumentType.of((String) aContext.getConfigParameterValue(PARAM_TYPE_TO_PROCESS));
+ find_dates = (Boolean) aContext.getConfigParameterValue(PARAM_DATE);
+ find_times = (Boolean) aContext.getConfigParameterValue(PARAM_TIME);
find_durations = (Boolean) aContext.getConfigParameterValue(PARAM_DURATION);
- find_sets = (Boolean) aContext.getConfigParameterValue(PARAM_SET);
+ find_sets = (Boolean) aContext.getConfigParameterValue(PARAM_SET);
find_temponyms = (Boolean) aContext.getConfigParameterValue(PARAM_TEMPONYMS);
- group_gran = (Boolean) aContext.getConfigParameterValue(PARAM_GROUP);
+ group_gran = (Boolean) aContext.getConfigParameterValue(PARAM_GROUP);
////////////////////////////////////////////////////////////
// READ NORMALIZATION RESOURCES FROM FILES AND STORE THEM //
////////////////////////////////////////////////////////////
NormalizationManager.getInstance(language, find_temponyms);
-
+
//////////////////////////////////////////////////////
// READ PATTERN RESOURCES FROM FILES AND STORE THEM //
//////////////////////////////////////////////////////
RePatternManager.getInstance(language, find_temponyms);
-
+
///////////////////////////////////////////////////
// READ RULE RESOURCES FROM FILES AND STORE THEM //
///////////////////////////////////////////////////
RuleManager.getInstance(language, find_temponyms);
-
- /////////////////////////////////////////////////////////////////////////////////
- // SUBPROCESSOR CONFIGURATION. REGISTER YOUR OWN PROCESSORS HERE FOR EXECUTION //
- /////////////////////////////////////////////////////////////////////////////////
- procMan.registerProcessor("de.unihd.dbs.uima.annotator.heideltime.processors.HolidayProcessor");
- procMan.registerProcessor("de.unihd.dbs.uima.annotator.heideltime.processors.DecadeProcessor");
+
+ ///////////////////////////////////////////////////////////////////
+ // SUBPROCESSOR CONFIGURATION. REGISTER YOUR OWN PROCESSORS HERE //
+ ///////////////////////////////////////////////////////////////////
+ procMan.registerProcessor(de.unihd.dbs.uima.annotator.heideltime.processors.HolidayProcessor.class.getName());
+ procMan.registerProcessor(de.unihd.dbs.uima.annotator.heideltime.processors.DecadeProcessor.class.getName());
procMan.initializeAllProcessors(aContext);
-
+
/////////////////////////////
// PRINT WHAT WILL BE DONE //
/////////////////////////////
- if (find_dates) Logger.printDetail("Getting Dates...");
- if (find_times) Logger.printDetail("Getting Times...");
- if (find_durations) Logger.printDetail("Getting Durations...");
- if (find_sets) Logger.printDetail("Getting Sets...");
- if (find_temponyms) Logger.printDetail("Getting Temponyms...");
+ LOG.debug("Enabled modules:{}{}{}{}{}", //
+ find_dates ? " dates" : "", //
+ find_times ? " times" : "", //
+ find_durations ? " durations" : "", //
+ find_sets ? " sets" : "", //
+ find_temponyms ? " temponyms" : "");
+
+ if (resolver == null)
+ resolver = new ResolveAmbiguousValues();
+ resolver.init(language, find_temponyms, typeToProcess);
}
-
/**
* @see JCasAnnotator_ImplBase#process(JCas)
*/
public void process(JCas jcas) {
- // check whether a given DCT (if any) is of the correct format and if not, skip this call
- if(!isValidDCT(jcas)) {
- Logger.printError(component, "The reader component of this workflow has set an incorrect DCT."
- + " HeidelTime expects either \"YYYYMMDD\" or \"YYYY-MM-DD...\". This document was skipped.");
+ // check whether a given DCT (if any) is of the correct format and if not, skip this call
+ if (!ResolveAmbiguousValues.ParsedDct.isValidDCT(jcas)) {
+ LOG.error("The reader component of this workflow has set an incorrect DCT.\n" + //
+ " HeidelTime expects either \"YYYYMMDD\" or \"YYYY-MM-DD...\", got \"{}\".\n" + //
+ "This document was skipped.", ResolveAmbiguousValues.ParsedDct.getDct(jcas));
return;
}
-
+
// run preprocessing processors
procMan.executeProcessors(jcas, Priority.PREPROCESSING);
-
+
RuleManager rulem = RuleManager.getInstance(language, find_temponyms);
-
+
timexID = 1; // reset counter once per document processing
timex_counter = 0;
- flagHistoricDates = false;
-
+ boolean flagHistoricDates = false;
+
////////////////////////////////////////////
// CHECK SENTENCE BY SENTENCE FOR TIMEXES //
////////////////////////////////////////////
- FSIterator sentIter = jcas.getAnnotationIndex(Sentence.type).iterator();
- /*
- * check if the pipeline has annotated any sentences. if not, heideltime can't do any work,
- * will return from process() with a warning message.
+ AnnotationIndex<Sentence> sentences = jcas.getAnnotationIndex(Sentence.type);
+ /*
+ * check if the pipeline has annotated any sentences. if not, heideltime can't do any work, will return from process() with a warning message.
*/
- if(!sentIter.hasNext()) {
- Logger.printError(component, "HeidelTime has not found any sentence tokens in this document. " +
- "HeidelTime needs sentence tokens tagged by a preprocessing UIMA analysis engine to " +
- "do its work. Please check your UIMA workflow and add an analysis engine that creates " +
- "these sentence tokens.");
+ if (sentences.size() == 0) {
+ LOG.error("HeidelTime has not found any sentence tokens in this document. " + "HeidelTime needs sentence tokens tagged by a preprocessing UIMA analysis engine to "
+ + "do its work. Please check your UIMA workflow and add an analysis engine that creates " + "these sentence tokens.");
}
-
- while (sentIter.hasNext()) {
- Sentence s = (Sentence) sentIter.next();
-
- Boolean debugIteration = false;
- Boolean oldDebugState = Logger.getPrintDetails();
- do {
- try {
- if (find_dates) {
- findTimexes("DATE", rulem.getHmDatePattern(), rulem.getHmDateOffset(), rulem.getHmDateNormalization(), s, jcas);
- }
- if (find_times) {
- findTimexes("TIME", rulem.getHmTimePattern(), rulem.getHmTimeOffset(), rulem.getHmTimeNormalization(), s, jcas);
- }
-
- /*
- * check for historic dates/times starting with BC
- * to check if post-processing step is required
- */
- if (typeToProcess.equals("narrative") || typeToProcess.equals("narratives")){
- FSIterator iterDates = jcas.getAnnotationIndex(Timex3.type).iterator();
- while (iterDates.hasNext()){
- Timex3 t = (Timex3) iterDates.next();
- if (t.getTimexValue().startsWith("BC")){
- flagHistoricDates = true;
- break;
- }
- }
- }
-
- if (find_sets) {
- findTimexes("SET", rulem.getHmSetPattern(), rulem.getHmSetOffset(), rulem.getHmSetNormalization(), s, jcas);
- }
- if (find_durations) {
- findTimexes("DURATION", rulem.getHmDurationPattern(), rulem.getHmDurationOffset(), rulem.getHmDurationNormalization(), s, jcas);
- }
- if (find_temponyms) {
- findTimexes("TEMPONYM", rulem.getHmTemponymPattern(), rulem.getHmTemponymOffset(), rulem.getHmTemponymNormalization(), s, jcas);
- }
- } catch(NullPointerException npe) {
- if(!debugIteration) {
- debugIteration = true;
- Logger.setPrintDetails(true);
-
- Logger.printError(component, "HeidelTime's execution has been interrupted by an exception that " +
- "is likely rooted in faulty normalization resource files. Please consider opening an issue " +
- "report containing the following information at our GitHub project issue tracker: " +
- "https://github.com/HeidelTime/heideltime/issues - Thanks!");
- npe.printStackTrace();
- Logger.printError(component, "Sentence [" + s.getBegin() + "-" + s.getEnd() + "]: " + s.getCoveredText());
- Logger.printError(component, "Language: " + language);
- Logger.printError(component, "Re-running this sentence with DEBUGGING enabled...");
- } else {
- debugIteration = false;
- Logger.setPrintDetails(oldDebugState);
-
- Logger.printError(component, "Execution will now resume.");
- }
- }
- } while(debugIteration);
+
+ TokenBoundaryMatcher matcher = new TokenBoundaryMatcher();
+ for (Sentence s : sentences) {
+ try {
+ final CharSequence coveredText = TokenBoundaryMatcher.simplifyString(s.getCoveredText());
+ if (LOG.isTraceEnabled())
+ LOG.trace("Sentence {}: {}", s.getSentenceId(), coveredText);
+
+ // Build a list of "good" token positions to anchor matches:
+ matcher.tokenBoundaries(coveredText, s, jcas);
+
+ if (find_dates)
+ findTimexes("DATE", rulem.getHmDateRules(), matcher, s, jcas, coveredText);
+ if (find_times)
+ findTimexes("TIME", rulem.getHmTimeRules(), matcher, s, jcas, coveredText);
+
+ /*
+ * check for historic dates/times starting with BC to check if post-processing step is required
+ */
+ if (typeToProcess == DocumentType.NARRATIVE) {
+ AnnotationIndex<Timex3> dates = jcas.getAnnotationIndex(Timex3.type);
+ for (Timex3 t : dates)
+ if (t.getTimexValue().startsWith("BC")) {
+ flagHistoricDates = true;
+ break;
+ }
+ }
+
+ if (find_sets)
+ findTimexes("SET", rulem.getHmSetRules(), matcher, s, jcas, coveredText);
+ if (find_durations)
+ findTimexes("DURATION", rulem.getHmDurationRules(), matcher, s, jcas, coveredText);
+ if (find_temponyms)
+ findTimexes("TEMPONYM", rulem.getHmTemponymRules(), matcher, s, jcas, coveredText);
+ } catch (NullPointerException npe) {
+ LOG.error("HeidelTime's execution has been interrupted by an exception that " + "is likely rooted in faulty normalization resource files. "
+ + "Please consider opening an issue report containing the following "
+ + "information at our GitHub project issue tracker (if possible, also enable debug logging): "
+ + "https://github.com/HeidelTime/heideltime/issues - Thanks!", npe);
+ LOG.error("Sentence [" + s.getBegin() + "-" + s.getEnd() + "]: " + s.getCoveredText());
+ LOG.error("Language: " + language);
+ // LOG.error("Re-running this sentence with DEBUGGING
+ // enabled...");
+ // TODO: add a flag to force-log debugging information?
+ }
}
/*
* kick out some overlapping expressions
*/
- if (deleteOverlapped == true)
- deleteOverlappedTimexesPreprocessing(jcas);
+ if (deleteOverlapping)
+ deleteOverlappingTimexesPreprocessing(jcas);
/*
- * specify ambiguous values, e.g.: specific year for date values of
- * format UNDEF-year-01-01; specific month for values of format UNDEF-last-month
+ * specify ambiguous values, e.g.: specific year for date values of format UNDEF-year-01-01; specific month for values of format UNDEF-last-month
*/
- specifyAmbiguousValues(jcas);
-
+ if (resolver != null)
+ resolver.specifyAmbiguousValues(jcas);
+
// disambiguate historic dates
// check dates without explicit hints to AD or BC if they might refer to BC dates
if (flagHistoricDates)
try {
disambiguateHistoricDates(jcas);
- } catch(Exception e) {
- Logger.printError("Something went wrong disambiguating historic dates.");
- e.printStackTrace();
+ } catch (Exception e) {
+ LOG.error("Failed disambiguating historic dates: {}", e.getMessage(), e);
}
- if (find_temponyms) {
+ if (find_temponyms)
TemponymPostprocessing.handleIntervals(jcas);
- }
-
+
/*
* kick out the rest of the overlapping expressions
*/
- if (deleteOverlapped == true)
+ if (deleteOverlapping)
deleteOverlappedTimexesPostprocessing(jcas);
-
+
// run arbitrary processors
procMan.executeProcessors(jcas, Priority.ARBITRARY);
-
+
// remove invalid timexes
removeInvalids(jcas);
-
+
// run postprocessing processors
procMan.executeProcessors(jcas, Priority.POSTPROCESSING);
- timex_counter_global = timex_counter_global + timex_counter;
- Logger.printDetail(component, "Number of Timexes added to CAS: "+timex_counter + "(global: "+timex_counter_global+")");
+ timex_counter_global += timex_counter;
+ LOG.info("Number of Timexes added to CAS: {} (global: {})", timex_counter, timex_counter_global);
}
-
/**
* Add timex annotation to CAS object.
*
@@ -335,1791 +322,386 @@ public void process(JCas jcas) {
* @param foundByRule
* @param jcas
*/
- public void addTimexAnnotation(String timexType, int begin, int end, Sentence sentence, String timexValue, String timexQuant,
- String timexFreq, String timexMod, String emptyValue, String timexId, String foundByRule, JCas jcas) {
-
+ public void addTimexAnnotation(String timexType, int begin, int end, Sentence sentence, String timexValue, String timexQuant, String timexFreq, String timexMod, String emptyValue,
+ String timexId, String foundByRule, JCas jcas) {
+
Timex3 annotation = new Timex3(jcas);
annotation.setBegin(begin);
annotation.setEnd(end);
annotation.setFilename(sentence.getFilename());
annotation.setSentId(sentence.getSentenceId());
-
+
annotation.setEmptyValue(emptyValue);
- FSIterator iterToken = jcas.getAnnotationIndex(Token.type).subiterator(sentence);
- String allTokIds = "";
- while (iterToken.hasNext()) {
- Token tok = (Token) iterToken.next();
- if (tok.getBegin() <= begin && tok.getEnd() > begin) {
- annotation.setFirstTokId(tok.getTokenId());
- allTokIds = "BEGIN<-->" + tok.getTokenId();
- }
- if ((tok.getBegin() > begin) && (tok.getEnd() <= end)) {
- allTokIds = allTokIds + "<-->" + tok.getTokenId();
+ AnnotationIndex<Token> tokens = jcas.getAnnotationIndex(Token.type);
+ if (doAllTokIds) {
+ StringBuilder allTokIds = new StringBuilder();
+ for (FSIterator<Token> iterToken = tokens.subiterator(sentence); iterToken.hasNext();) {
+ Token tok = iterToken.next();
+ if (tok.getBegin() <= begin && tok.getEnd() > begin) {
+ annotation.setFirstTokId(tok.getTokenId());
+ allTokIds.setLength(0);
+ allTokIds.append("BEGIN<-->").append(tok.getTokenId());
+ }
+ if ((tok.getBegin() > begin) && (tok.getEnd() <= end))
+ allTokIds.append("<-->").append(tok.getTokenId());
}
+ annotation.setAllTokIds(allTokIds.toString());
}
- annotation.setAllTokIds(allTokIds);
annotation.setTimexType(timexType);
annotation.setTimexValue(timexValue);
annotation.setTimexId(timexId);
annotation.setFoundByRule(foundByRule);
- if ((timexType.equals("DATE")) || (timexType.equals("TIME"))) {
- if ((timexValue.startsWith("X")) || (timexValue.startsWith("UNDEF"))) {
- annotation.setFoundByRule(foundByRule+"-relative");
+ if (timexType.equals("DATE") || timexType.equals("TIME")) {
+ if (timexValue.startsWith("X") || timexValue.startsWith("UNDEF")) {
+ annotation.setFoundByRule(foundByRule + "-relative");
} else {
- annotation.setFoundByRule(foundByRule+"-explicit");
+ annotation.setFoundByRule(foundByRule + "-explicit");
}
}
- if (!(timexQuant == null)) {
+ if (timexQuant != null)
annotation.setTimexQuant(timexQuant);
- }
- if (!(timexFreq == null)) {
+ if (timexFreq != null)
annotation.setTimexFreq(timexFreq);
- }
- if (!(timexMod == null)) {
+ if (timexMod != null)
annotation.setTimexMod(timexMod);
- }
annotation.addToIndexes();
this.timex_counter++;
-
- Logger.printDetail(annotation.getTimexId()+"EXTRACTION PHASE: "+" found by:"+annotation.getFoundByRule()+" text:"+annotation.getCoveredText());
- Logger.printDetail(annotation.getTimexId()+"NORMALIZATION PHASE:"+" found by:"+annotation.getFoundByRule()+" text:"+annotation.getCoveredText()+" value:"+annotation.getTimexValue());
-
+
+ if (LOG.isTraceEnabled()) {
+ LOG.trace(annotation.getTimexId() + " EXTRACTION PHASE: " + " found by:" + annotation.getFoundByRule() + " text:" + annotation.getCoveredText());
+ LOG.trace(annotation.getTimexId() + " NORMALIZATION PHASE:" + " found by:" + annotation.getFoundByRule() + " text:" + annotation.getCoveredText() + " value:"
+ + annotation.getTimexValue());
+ }
}
-
/**
- * Postprocessing: Check dates starting with "0" which were extracted without
- * explicit "AD" hints if it is likely that they refer to the respective date BC
+ * Postprocessing: Check dates starting with "0" which were extracted without explicit "AD" hints if it is likely that they refer to the respective date BC
*
* @param jcas
*/
- public void disambiguateHistoricDates(JCas jcas){
-
+ public void disambiguateHistoricDates(JCas jcas) {
// build up a list with all found TIMEX expressions
- List linearDates = new ArrayList();
- FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator();
+ AnnotationIndex<Timex3> annotations = jcas.getAnnotationIndex(Timex3.type);
// Create List of all Timexes of types "date" and "time"
- while (iterTimex.hasNext()) {
- Timex3 timex = (Timex3) iterTimex.next();
- if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME")) {
+ List<Timex3> linearDates = new ArrayList<Timex3>();
+ for (Timex3 timex : annotations)
+ if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME"))
linearDates.add(timex);
- }
- }
-
- //////////////////////////////////////////////
- // go through list of Date and Time timexes //
- //////////////////////////////////////////////
+
+ //////////////////////////////////////////////
+ // go through list of Date and Time timexes //
+ //////////////////////////////////////////////
for (int i = 1; i < linearDates.size(); i++) {
- Timex3 t_i = (Timex3) linearDates.get(i);
- String value_i = t_i.getTimexValue();
- String newValue = value_i;
- Boolean change = false;
- if (!(t_i.getFoundByRule().contains("-BCADhint"))){
- if (value_i.startsWith("0")){
- Integer offset = 1, counter = 1;
- do {
- if ((i == 1 || (i > 1 && !change)) && linearDates.get(i-offset).getTimexValue().startsWith("BC")){
- if (value_i.length()>1){
- if ((linearDates.get(i-offset).getTimexValue().startsWith("BC"+value_i.substring(0,2))) ||
- (linearDates.get(i-offset).getTimexValue().startsWith("BC"+String.format("%02d",(Integer.parseInt(value_i.substring(0,2))+1))))){
- if (((value_i.startsWith("00")) && (linearDates.get(i-offset).getTimexValue().startsWith("BC00"))) ||
- ((value_i.startsWith("01")) && (linearDates.get(i-offset).getTimexValue().startsWith("BC01")))){
- if ((value_i.length()>2) && (linearDates.get(i-offset).getTimexValue().length()>4)){
- if (Integer.parseInt(value_i.substring(0,3)) <= Integer.parseInt(linearDates.get(i-offset).getTimexValue().substring(2,5))){
- newValue = "BC" + value_i;
- change = true;
- Logger.printDetail("DisambiguateHistoricDates: "+value_i+" to "+newValue+". Expression "+t_i.getCoveredText()+" due to "+linearDates.get(i-offset).getCoveredText());
- }
- }
- }
- else{
+ Timex3 t_i = linearDates.get(i);
+ if (t_i.getFoundByRule().contains("-BCADhint"))
+ continue;
+ String value_i = t_i.getTimexValue(), newValue = value_i;
+ if (value_i.charAt(0) != '0')
+ continue;
+ boolean change = false;
+ int offset = 1, counter = 1;
+ do {
+ String txval = linearDates.get(i - offset).getTimexValue();
+ if ((i == 1 || (i > 1 && !change)) && txval.startsWith("BC")) {
+ if (value_i.length() > 1) {
+ if (txval.startsWith("BC" + value_i.substring(0, 2)) //
+ || txval.startsWith(String.format("BC%02d", parseInt(value_i, 0, 2) + 1))) {
+ if ((value_i.startsWith("00") && txval.startsWith("BC00")) || (value_i.startsWith("01") && txval.startsWith("BC01"))) {
+ if ((value_i.length() > 2) && (txval.length() > 4)) {
+ if (parseInt(value_i, 0, 3) <= parseInt(txval, 2, 5)) {
newValue = "BC" + value_i;
change = true;
- Logger.printDetail("DisambiguateHistoricDates: "+value_i+" to "+newValue+". Expression "+t_i.getCoveredText()+" due to "+linearDates.get(i-offset).getCoveredText());
+ if (LOG.isDebugEnabled())
+ LOG.debug("DisambiguateHistoricDates: " + value_i + " to " + newValue + ". Expression " + t_i.getCoveredText()
+ + " due to " + linearDates.get(i - offset).getCoveredText());
}
}
- }
- }
-
- if ((linearDates.get(i-offset).getTimexType().equals("TIME") || linearDates.get(i-offset).getTimexType().equals("DATE")) &&
- (linearDates.get(i-offset).getTimexValue().matches("^\\d.*"))) {
- counter++;
+ } else {
+ newValue = "BC" + value_i;
+ change = true;
+ if (LOG.isDebugEnabled())
+ LOG.debug("DisambiguateHistoricDates: " + value_i + " to " + newValue + ". Expression " + t_i.getCoveredText() + " due to "
+ + linearDates.get(i - offset).getCoveredText());
+ }
}
- } while (counter < 5 && ++offset < i);
+ }
}
- }
- if (!(newValue.equals(value_i))){
+
+ String txtype = linearDates.get(i - offset).getTimexType();
+ if ((txtype.equals("TIME") || txtype.equals("DATE")) && txval.matches("^\\d.*")) {
+ counter++;
+ }
+ } while (counter < 5 && ++offset < i);
+ if (!newValue.equals(value_i)) {
t_i.removeFromIndexes();
- Logger.printDetail("DisambiguateHistoricDates: value changed to BC");
+ LOG.debug("DisambiguateHistoricDates: value changed to BC");
t_i.setTimexValue(newValue);
t_i.addToIndexes();
linearDates.set(i, t_i);
}
- }
+ }
}
-
+
/**
- * Postprocessing: Remove invalid timex expressions. These are already
- * marked as invalid: timexValue().equals("REMOVE")
+ * Postprocessing: Remove invalid timex expressions. These are already marked as invalid: timexValue().equals("REMOVE")
*
* @param jcas
*/
public void removeInvalids(JCas jcas) {
-
/*
- * Iterate over timexes and add invalids to HashSet
- * (invalids cannot be removed directly since iterator is used)
+ * Iterate over timexes and add invalids to HashSet (invalids cannot be removed directly since iterator is used)
*/
- FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator();
+ AnnotationIndex<Timex3> timexes = jcas.getAnnotationIndex(Timex3.type);
HashSet<Timex3> hsTimexToRemove = new HashSet<Timex3>();
- while (iterTimex.hasNext()) {
- Timex3 timex = (Timex3) iterTimex.next();
- if (timex.getTimexValue().equals("REMOVE")) {
+ for (Timex3 timex : timexes)
+ if (timex.getTimexValue().equals("REMOVE"))
hsTimexToRemove.add(timex);
- }
- }
// remove invalids, finally
for (Timex3 timex3 : hsTimexToRemove) {
timex3.removeFromIndexes();
this.timex_counter--;
- Logger.printDetail(timex3.getTimexId()+" REMOVING PHASE: "+"found by:"+timex3.getFoundByRule()+" text:"+timex3.getCoveredText()+" value:"+timex3.getTimexValue());
+ if (LOG.isDebugEnabled())
+ LOG.debug("{} REMOVING PHASE: found by: {} text:{} value:{}", timex3.getTimexId(), timex3.getFoundByRule(), timex3.getCoveredText(), timex3.getTimexValue());
}
}
- @SuppressWarnings("unused")
- public String specifyAmbiguousValuesString(String ambigString, Timex3 t_i, Integer i, List linearDates, JCas jcas) {
- NormalizationManager norm = NormalizationManager.getInstance(language, find_temponyms);
-
- // //////////////////////////////////////
- // IS THERE A DOCUMENT CREATION TIME? //
- // //////////////////////////////////////
- boolean dctAvailable = false;
-
- // ////////////////////////////
- // DOCUMENT TYPE TO PROCESS //
- // //////////////////////////
- boolean documentTypeNews = false;
- boolean documentTypeNarrative = false;
- boolean documentTypeColloquial = false;
- boolean documentTypeScientific = false;
- if (typeToProcess.equals("news")) {
- documentTypeNews = true;
- }
- if (typeToProcess.equals("narrative")
- || typeToProcess.equals("narratives")) {
- documentTypeNarrative = true;
- }
- if (typeToProcess.equals("colloquial")) {
- documentTypeColloquial = true;
- }
- if (typeToProcess.equals("scientific")) {
- documentTypeScientific = true;
- }
-
- // get the dct information
- String dctValue = "";
- int dctCentury = 0;
- int dctYear = 0;
- int dctDecade = 0;
- int dctMonth = 0;
- int dctDay = 0;
- String dctSeason = "";
- String dctQuarter = "";
- String dctHalf = "";
- int dctWeekday = 0;
- int dctWeek = 0;
-
- // ////////////////////////////////////////////
- // INFORMATION ABOUT DOCUMENT CREATION TIME //
- // ////////////////////////////////////////////
- FSIterator dctIter = jcas.getAnnotationIndex(Dct.type).iterator();
- if (dctIter.hasNext()) {
- dctAvailable = true;
- Dct dct = (Dct) dctIter.next();
- dctValue = dct.getValue();
- // year, month, day as mentioned in the DCT
- if (dctValue.matches("\\d\\d\\d\\d\\d\\d\\d\\d")) {
- dctCentury = Integer.parseInt(dctValue.substring(0, 2));
- dctYear = Integer.parseInt(dctValue.substring(0, 4));
- dctDecade = Integer.parseInt(dctValue.substring(2, 3));
- dctMonth = Integer.parseInt(dctValue.substring(4, 6));
- dctDay = Integer.parseInt(dctValue.substring(6, 8));
-
- Logger.printDetail("dctCentury:" + dctCentury);
- Logger.printDetail("dctYear:" + dctYear);
- Logger.printDetail("dctDecade:" + dctDecade);
- Logger.printDetail("dctMonth:" + dctMonth);
- Logger.printDetail("dctDay:" + dctDay);
- } else {
- dctCentury = Integer.parseInt(dctValue.substring(0, 2));
- dctYear = Integer.parseInt(dctValue.substring(0, 4));
- dctDecade = Integer.parseInt(dctValue.substring(2, 3));
- dctMonth = Integer.parseInt(dctValue.substring(5, 7));
- dctDay = Integer.parseInt(dctValue.substring(8, 10));
-
- Logger.printDetail("dctCentury:" + dctCentury);
- Logger.printDetail("dctYear:" + dctYear);
- Logger.printDetail("dctDecade:" + dctDecade);
- Logger.printDetail("dctMonth:" + dctMonth);
- Logger.printDetail("dctDay:" + dctDay);
- }
- dctQuarter = "Q"
- + norm.getFromNormMonthInQuarter(norm
- .getFromNormNumber(dctMonth + ""));
- dctHalf = "H1";
- if (dctMonth > 6) {
- dctHalf = "H2";
- }
-
- // season, week, weekday, have to be calculated
- dctSeason = norm.getFromNormMonthInSeason(norm
- .getFromNormNumber(dctMonth + "") + "");
- dctWeekday = DateCalculator.getWeekdayOfDate(dctYear + "-"
- + norm.getFromNormNumber(dctMonth + "") + "-"
- + norm.getFromNormNumber(dctDay + ""));
- dctWeek = DateCalculator.getWeekOfDate(dctYear + "-"
- + norm.getFromNormNumber(dctMonth + "") + "-"
- + norm.getFromNormNumber(dctDay + ""));
-
- Logger.printDetail("dctQuarter:" + dctQuarter);
- Logger.printDetail("dctSeason:" + dctSeason);
- Logger.printDetail("dctWeekday:" + dctWeekday);
- Logger.printDetail("dctWeek:" + dctWeek);
- } else {
- Logger.printDetail("No DCT available...");
- }
-
- // check if value_i has month, day, season, week (otherwise no UNDEF-year is possible)
- Boolean viHasMonth = false;
- Boolean viHasDay = false;
- Boolean viHasSeason = false;
- Boolean viHasWeek = false;
- Boolean viHasQuarter = false;
- Boolean viHasHalf = false;
- int viThisMonth = 0;
- int viThisDay = 0;
- String viThisSeason = "";
- String viThisQuarter = "";
- String viThisHalf = "";
- String[] valueParts = ambigString.split("-");
- // check if UNDEF-year or UNDEF-century
- if ((ambigString.startsWith("UNDEF-year")) || (ambigString.startsWith("UNDEF-century"))) {
- if (valueParts.length > 2) {
- // get vi month
- if (valueParts[2].matches("\\d\\d")) {
- viHasMonth = true;
- viThisMonth = Integer.parseInt(valueParts[2]);
- }
- // get vi season
- else if ((valueParts[2].equals("SP")) || (valueParts[2].equals("SU")) || (valueParts[2].equals("FA")) || (valueParts[2].equals("WI"))) {
- viHasSeason = true;
- viThisSeason = valueParts[2];
- }
- // get v1 quarter
- else if ((valueParts[2].equals("Q1")) || (valueParts[2].equals("Q2")) || (valueParts[2].equals("Q3")) || (valueParts[2].equals("Q4"))) {
- viHasQuarter = true;
- viThisQuarter = valueParts[2];
- }
- // get v1 half
- else if ((valueParts[2].equals("H1")) || (valueParts[2].equals("H2"))) {
- viHasHalf = true;
- viThisHalf = valueParts[2];
- }
- // get vi day
- if ((valueParts.length > 3) && (valueParts[3].matches("\\d\\d"))) {
- viHasDay = true;
- viThisDay = Integer.parseInt(valueParts[3]);
- }
+ /**
+ * @param jcas
+ */
+ private void deleteOverlappingTimexesPreprocessing(JCas jcas) {
+ AnnotationIndex<Timex3> timexes = jcas.getAnnotationIndex(Timex3.type);
+ HashSet<Timex3> hsTimexesToRemove = new HashSet<Timex3>();
+ for (Timex3 t1 : timexes) {
+ if (hsTimexesToRemove.contains(t1)) {
+ continue;
}
- }
- else {
- if (valueParts.length > 1) {
- // get vi month
- if (valueParts[1].matches("\\d\\d")) {
- viHasMonth = true;
- viThisMonth = Integer.parseInt(valueParts[1]);
+ for (Timex3 t2 : timexes) {
+ if (t1 == t2 || hsTimexesToRemove.contains(t2)) {
+ continue;
}
- // get vi season
- else if ((valueParts[1].equals("SP")) || (valueParts[1].equals("SU")) || (valueParts[1].equals("FA")) || (valueParts[1].equals("WI"))) {
- viHasSeason = true;
- viThisSeason = valueParts[1];
- }
- // get v1 quarter
- else if ((valueParts[1].equals("Q1")) || (valueParts[1].equals("Q2")) || (valueParts[1].equals("Q3")) || (valueParts[1].equals("Q4"))) {
- viHasQuarter = true;
- viThisQuarter = valueParts[1];
- }
- // get v1 half
- else if ((valueParts[1].equals("H1")) || (valueParts[1].equals("H2"))) {
- viHasHalf = true;
- viThisHalf = valueParts[1];
- }
- // get vi day
- if ((valueParts.length > 2) && (valueParts[2].matches("\\d\\d"))) {
- viHasDay = true;
- viThisDay = Integer.parseInt(valueParts[2]);
- }
- }
- }
- // get the last tense (depending on the part of speech tags used in front or behind the expression)
- String last_used_tense = ContextAnalyzer.getLastTense(t_i, jcas, language);
-
- //////////////////////////
- // DISAMBIGUATION PHASE //
- //////////////////////////
-
- ////////////////////////////////////////////////////
- // IF YEAR IS COMPLETELY UNSPECIFIED (UNDEF-year) //
- ////////////////////////////////////////////////////
- String valueNew = ambigString;
- if (ambigString.startsWith("UNDEF-year")) {
- String newYearValue = dctYear+"";
- // vi has month (ignore day)
- if ((viHasMonth == true) && (viHasSeason == false)) {
- // WITH DOCUMENT CREATION TIME
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- // Tense is FUTURE
- if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) {
- // if dct-month is larger than vi-month, than add 1 to dct-year
- if (dctMonth > viThisMonth) {
- int intNewYear = dctYear + 1;
- newYearValue = intNewYear + "";
- }
- }
- // Tense is PAST
- if ((last_used_tense.equals("PAST"))) {
- // if dct-month is smaller than vi month, than substrate 1 from dct-year
- if (dctMonth < viThisMonth) {
- int intNewYear = dctYear - 1;
- newYearValue = intNewYear + "";
- }
- }
+ if ( // t1 starts inside or with t2 and ends before t2 -> remove t1
+ ((t1.getBegin() >= t2.getBegin()) && (t1.getEnd() < t2.getEnd()))
+ // t1 starts inside t2 and ends with or before t2 -> remove t1
+ || ((t1.getBegin() > t2.getBegin()) && (t1.getEnd() <= t2.getEnd()))) {
+ logRemove(t1, "overlaps and begins later than", t2);
+ hsTimexesToRemove.add(t1);
+ continue;
}
- // WITHOUT DOCUMENT CREATION TIME
- else {
- newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language);
+ // t2 starts inside or with t1 and ends before t1 -> remove t2
+ if (((t2.getBegin() >= t1.getBegin()) && (t2.getEnd() < t1.getEnd()))
+ // t2 starts inside t1 and ends with or before t1 -> remove t2
+ || ((t2.getBegin() > t1.getBegin()) && (t2.getEnd() <= t1.getEnd()))) {
+ logRemove(t2, "overlaps and begins later than", t1);
+ hsTimexesToRemove.add(t2);
+ continue;
}
- }
- // vi has quaurter
- if (viHasQuarter == true) {
- // WITH DOCUMENT CREATION TIME
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- // Tense is FUTURE
- if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) {
- if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))) {
- int intNewYear = dctYear + 1;
- newYearValue = intNewYear + "";
- }
- }
- // Tense is PAST
- if ((last_used_tense.equals("PAST"))) {
- if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))) {
- int intNewYear = dctYear - 1;
- newYearValue = intNewYear + "";
- }
- }
- // IF NO TENSE IS FOUND
- if (last_used_tense.equals("")){
- if (documentTypeColloquial){
- // IN COLLOQUIAL: future temporal expressions
- if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))){
- int intNewYear = dctYear + 1;
- newYearValue = intNewYear + "";
- }
- }
- else{
- // IN NEWS: past temporal expressions
- if (Integer.parseInt(dctQuarter.substring(1)) < Integer.parseInt(viThisQuarter.substring(1))){
- int intNewYear = dctYear - 1;
- newYearValue = intNewYear + "";
- }
- }
+ // identical length
+ if ((t1.getBegin() == t2.getBegin()) && (t1.getEnd() == t2.getEnd())) {
+ if (t1.getTimexValue().startsWith("UNDEF") && !t2.getTimexValue().startsWith("UNDEF")) {
+ logRemove(t1, "is UNDEF, compared to", t2);
+ hsTimexesToRemove.add(t1);
+ } else if (!t1.getTimexValue().startsWith("UNDEF") && t2.getTimexValue().startsWith("UNDEF")) {
+ logRemove(t2, "is UNDEF, compared to", t1);
+ hsTimexesToRemove.add(t2);
}
- }
- // WITHOUT DOCUMENT CREATION TIME
- else {
- newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language);
- }
- }
- // vi has half
- if (viHasHalf == true) {
- // WITH DOCUMENT CREATION TIME
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- // Tense is FUTURE
- if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) {
- if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))) {
- int intNewYear = dctYear + 1;
- newYearValue = intNewYear + "";
- }
+ // t1 is explicit, but t2 is not
+ else if (t1.getFoundByRule().endsWith("explicit") && !t2.getFoundByRule().endsWith("explicit")) {
+ logRemove(t2, "is not explicit, compared to", t1);
+ hsTimexesToRemove.add(t2);
}
- // Tense is PAST
- if ((last_used_tense.equals("PAST"))) {
- if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))) {
- int intNewYear = dctYear - 1;
- newYearValue = intNewYear + "";
- }
+ // remove timexes that are identical, but one has an emptyvalue
+ else if (t2.getEmptyValue().isEmpty() && !t1.getEmptyValue().isEmpty()) {
+ logRemove(t2, "has emptyvalue, compared to", t1);
+ hsTimexesToRemove.add(t2);
}
- // IF NO TENSE IS FOUND
- if (last_used_tense.equals("")){
- if (documentTypeColloquial){
- // IN COLLOQUIAL: future temporal expressions
- if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))){
- int intNewYear = dctYear + 1;
- newYearValue = intNewYear + "";
- }
- }
- else{
- // IN NEWS: past temporal expressions
- if (Integer.parseInt(dctHalf.substring(1)) < Integer.parseInt(viThisHalf.substring(1))){
- int intNewYear = dctYear - 1;
- newYearValue = intNewYear + "";
- }
- }
+ // REMOVE REAL DUPLICATES (the one with the lower timexID)
+ else if (parseIntAt(t1.getTimexId(), 1) < parseIntAt(t2.getTimexId(), 1)) {
+ logRemove(t1, "has lower id value than", t2);
+ hsTimexesToRemove.add(t1);
}
}
- // WITHOUT DOCUMENT CREATION TIME
- else {
- newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language);
- }
}
-
- // vi has season
- if ((viHasMonth == false) && (viHasDay == false) && (viHasSeason == true)) {
- // TODO check tenses?
- // WITH DOCUMENT CREATION TIME
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- newYearValue = dctYear+"";
- }
- // WITHOUT DOCUMENT CREATION TIME
- else {
- newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language);
- }
+ }
+ // remove, finally
+ for (Timex3 t : hsTimexesToRemove) {
+ t.removeFromIndexes();
+ timex_counter--;
+ }
+ }
+
+ private void logRemove(Timex3 t1, String reason, Timex3 t2) {
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("DUPLICATE: {} (id:{} value:{} found by:{}) removed because it {} {} (id:{} value:{} found by:{})", //
+ t1.getCoveredText(), t1.getTimexId(), t1.getTimexValue(), t1.getFoundByRule(), //
+ reason, //
+ t2.getCoveredText(), t2.getTimexId(), t2.getTimexValue(), t2.getFoundByRule());
+ }
+ }
+
+ private void deleteOverlappedTimexesPostprocessing(JCas jcas) {
+ AnnotationIndex<Timex3> timexes = jcas.getAnnotationIndex(Timex3.type);
+ HashSet<ArrayList<Timex3>> effectivelyToInspect = new HashSet<ArrayList<Timex3>>();
+ ArrayList<Timex3> allTimexesToInspect = new ArrayList<Timex3>();
+ for (Timex3 myTimex : timexes) {
+ ArrayList<Timex3> timexSet = new ArrayList<Timex3>();
+ if (!myTimex.getTimexType().equals("TEMPONYM")) {
+ timexSet.add(myTimex);
}
- // vi has week
- if (viHasWeek) {
- // WITH DOCUMENT CREATION TIME
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- newYearValue = dctYear+"";
- }
- // WITHOUT DOCUMENT CREATION TIME
- else {
- newYearValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language);
+
+ // compare this timex to all other timexes and mark those that
+ // have an overlap
+ for (Timex3 myInnerTimex : timexes) {
+ if (!myInnerTimex.getTimexType().equals("TEMPONYM")) {
+ if (// timex1 starts, timex2 is partial overlap
+ (myTimex.getBegin() <= myInnerTimex.getBegin() && myTimex.getEnd() > myInnerTimex.getBegin()) ||
+ // same as above, but in reverse
+ (myInnerTimex.getBegin() <= myTimex.getBegin() && myInnerTimex.getEnd() > myTimex.getBegin()) ||
+ // timex 1 is contained within or identical to timex2
+ (myInnerTimex.getBegin() <= myTimex.getBegin() && myTimex.getEnd() <= myInnerTimex.getEnd()) ||
+ // same as above, but in reverse
+ (myTimex.getBegin() <= myInnerTimex.getBegin() && myInnerTimex.getEnd() <= myTimex.getEnd())) {
+
+ // increase the set
+ timexSet.add(myInnerTimex);
+ // note that these timexes are being looked at
+ allTimexesToInspect.add(myTimex);
+ allTimexesToInspect.add(myInnerTimex);
+ }
}
}
- // REPLACE THE UNDEF-YEAR WITH THE NEWLY CALCULATED YEAR AND ADD TIMEX TO INDEXES
- if (newYearValue.equals("")) {
- valueNew = ambigString.replaceFirst("UNDEF-year", "XXXX");
- }
- else {
- valueNew = ambigString.replaceFirst("UNDEF-year", newYearValue);
+ // if overlaps with myTimex were detected, memorize them
+ if (timexSet.size() > 1)
+ effectivelyToInspect.add(timexSet);
+ }
+
+ /*
+ * prune those sets of overlapping timexes that are subsets of others (i.e. leave only the largest union of overlapping timexes)
+ */
+ HashSet<ArrayList<Timex3>> newEffectivelyToInspect = new HashSet<ArrayList<Timex3>>();
+ for (Timex3 t : allTimexesToInspect) {
+ ArrayList<Timex3> setToKeep = new ArrayList<Timex3>();
+
+ // determine the largest set that contains this timex
+ for (ArrayList<Timex3> tSet : effectivelyToInspect) {
+ if (tSet.contains(t) && tSet.size() > setToKeep.size())
+ setToKeep = tSet;
}
+
+ newEffectivelyToInspect.add(setToKeep);
}
+ // overwrite previous list of sets
+ effectivelyToInspect = newEffectivelyToInspect;
- ///////////////////////////////////////////////////
- // just century is unspecified (UNDEF-century86) //
- ///////////////////////////////////////////////////
- else if ((ambigString.startsWith("UNDEF-century"))) {
- String newCenturyValue = dctCentury+"";
-
- // NEWS and COLLOQUIAL DOCUMENTS
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && !ambigString.equals("UNDEF-century")) {
- int viThisDecade = Integer.parseInt(ambigString.substring(13, 14));
-
- Logger.printDetail("dctCentury"+dctCentury);
-
- newCenturyValue = dctCentury+"";
- Logger.printDetail("dctCentury"+dctCentury);
-
- // Tense is FUTURE
- if ((last_used_tense.equals("FUTURE")) || (last_used_tense.equals("PRESENTFUTURE"))) {
- if (viThisDecade < dctDecade) {
- newCenturyValue = dctCentury + 1+"";
- } else {
- newCenturyValue = dctCentury+"";
- }
- }
- // Tense is PAST
- if ((last_used_tense.equals("PAST"))) {
- if (dctDecade < viThisDecade) {
- newCenturyValue = dctCentury - 1+"";
- } else {
- newCenturyValue = dctCentury+"";
- }
+ // iterate over the selected sets and merge information, remove old timexes
+ for (ArrayList<Timex3> tSet : effectivelyToInspect) {
+ Timex3 newTimex;
+
+ // if a timex has the timex value REMOVE, remove it from consideration
+ @SuppressWarnings("unchecked")
+ ArrayList<Timex3> newTSet = (ArrayList<Timex3>) tSet.clone();
+ for (Timex3 t : tSet) {
+ // remove timexes with value "REMOVE"
+ if (t.getTimexValue().equals("REMOVE")) {
+ newTSet.remove(t);
}
}
- // NARRATIVE DOCUMENTS
- else {
- newCenturyValue = ContextAnalyzer.getLastMentionedX(linearDates, i, "century", language);
- if (!(newCenturyValue.startsWith("BC"))){
- if ((newCenturyValue.matches("^\\d\\d.*")) && (Integer.parseInt(newCenturyValue.substring(0, 2)) < 10)){
- newCenturyValue = "00";
+ tSet = newTSet;
+
+ // iteration is done if all the timexes have been removed, i.e.
+ // the set is empty
+ if (tSet.size() == 0)
+ continue;
+
+ /*
+ * check - whether all timexes of this set have the same timex type attribute, - which one in the set has the longest value attribute string length, - what the combined extents
+ * are
+ */
+ boolean allSameTypes = true;
+ String timexType = null;
+ Timex3 longestTimex = null;
+ int combinedBegin = Integer.MAX_VALUE, combinedEnd = Integer.MIN_VALUE;
+ ArrayList<Integer> tokenIds = new ArrayList<Integer>();
+ for (Timex3 t : tSet) {
+ // check whether the types are identical and either all
+ // DATE or TIME
+ if (timexType == null) {
+ timexType = t.getTimexType();
+ } else {
+ if (allSameTypes && !timexType.equals(t.getTimexType()) || !(timexType.equals("DATE") || timexType.equals("TIME"))) {
+ allSameTypes = false;
}
- }else{
- newCenturyValue = "00";
}
- }
- if (newCenturyValue.equals("")){
- if (!(documentTypeNarrative)) {
- // always assume that sixties, twenties, and so on are 19XX if no century found (LREC change)
- valueNew = ambigString.replaceFirst("UNDEF-century", "19");
+ if (LOG.isTraceEnabled())
+ LOG.trace("Are these overlapping timexes of same type? => {}", allSameTypes);
+
+ // check timex value attribute string length
+ if (longestTimex == null) {
+ longestTimex = t;
+ } else if (allSameTypes && t.getFoundByRule().indexOf("-BCADhint") != -1) {
+ longestTimex = t;
+ } else if (allSameTypes && t.getFoundByRule().indexOf("relative") == -1 && longestTimex.getFoundByRule().indexOf("relative") != -1) {
+ longestTimex = t;
+ } else if (longestTimex.getTimexValue().length() == t.getTimexValue().length()) {
+ if (t.getBegin() < longestTimex.getBegin())
+ longestTimex = t;
+ } else if (longestTimex.getTimexValue().length() < t.getTimexValue().length()) {
+ longestTimex = t;
}
- // LREC change: assume in narrative-style documents that if no other century was mentioned before, 1st century
- else {
- valueNew = ambigString.replaceFirst("UNDEF-century", "00");
+ if (LOG.isTraceEnabled())
+ LOG.trace("Selected {}: {} [{}] as the longest-valued timex.", longestTimex.getTimexId(), longestTimex.getCoveredText(), longestTimex.getTimexValue());
+
+ // check combined beginning/end
+ if (combinedBegin > t.getBegin())
+ combinedBegin = t.getBegin();
+ if (combinedEnd < t.getEnd())
+ combinedEnd = t.getEnd();
+ if (LOG.isTraceEnabled())
+ LOG.trace("Selected combined constraints: {}:{}", combinedBegin, combinedEnd);
+
+ // disassemble and remember the token ids
+ if (doAllTokIds) {
+ String[] tokenizedTokenIds = t.getAllTokIds().split("<-->");
+ for (int i = 1; i < tokenizedTokenIds.length; i++) {
+ int tokid = parseInt(tokenizedTokenIds[i]);
+ if (!tokenIds.contains(tokid))
+ tokenIds.add(tokid);
+ }
}
}
- else {
- valueNew = ambigString.replaceFirst("UNDEF-century", newCenturyValue);
- }
- // always assume that sixties, twenties, and so on are 19XX -- if not narrative document (LREC change)
- if ((valueNew.matches("\\d\\d\\d")) && (!(documentTypeNarrative))) {
- valueNew = "19" + valueNew.substring(2);
+
+ /*
+ * types are equal => merge constraints, use the longer, "more granular" value. if types are not equal, just take the longest value.
+ */
+ Collections.sort(tokenIds);
+ newTimex = longestTimex;
+ if (allSameTypes) {
+ newTimex.setBegin(combinedBegin);
+ newTimex.setEnd(combinedEnd);
+ if (tokenIds.size() > 0)
+ newTimex.setFirstTokId(tokenIds.get(0));
+ String tokenIdText = "BEGIN";
+ for (Integer tokenId : tokenIds)
+ tokenIdText += "<-->" + tokenId;
+ newTimex.setAllTokIds(tokenIdText);
}
+
+ // remove old overlaps.
+ for (Timex3 t : tSet)
+ t.removeFromIndexes();
+ // add the single constructed/chosen timex to the indexes.
+ newTimex.addToIndexes();
}
-
- ////////////////////////////////////////////////////
- // CHECK IMPLICIT EXPRESSIONS STARTING WITH UNDEF //
- ////////////////////////////////////////////////////
- else if (ambigString.startsWith("UNDEF")) {
- valueNew = ambigString;
- if (ambigString.matches("^UNDEF-REFDATE$")){
- if (i > 0){
- Timex3 anyDate = linearDates.get(i-1);
- String lmDate = anyDate.getTimexValue();
- valueNew = lmDate;
- }
- else{
- valueNew = "XXXX-XX-XX";
- }
+ }
- //////////////////
- // TO CALCULATE //
- //////////////////
- // year to calculate
- } else if (ambigString.matches("^UNDEF-(this|REFUNIT|REF)-(.*?)-(MINUS|PLUS)-([0-9]+).*")) {
- for (MatchResult mr : Toolbox.findMatches(Pattern.compile("^(UNDEF-(this|REFUNIT|REF)-(.*?)-(MINUS|PLUS)-([0-9]+)).*"), ambigString)) {
- String checkUndef = mr.group(1);
- String ltn = mr.group(2);
- String unit = mr.group(3);
- String op = mr.group(4);
- String sDiff = mr.group(5);
- int diff = 0;
- try {
- diff = Integer.parseInt(sDiff);
- } catch (Exception e) {
- Logger.printError(component, "Expression difficult to normalize: ");
- Logger.printError(component, ambigString);
- Logger.printError(component, sDiff + " probably too long for parsing as integer.");
- Logger.printError(component, "set normalized value as PAST_REF / FUTURE_REF:");
- if (op.equals("PLUS")){
- valueNew = "FUTURE_REF";
- }
- else {
- valueNew = "PAST_REF";
- }
- break;
- }
-
-
- // do the processing for SCIENTIFIC documents (TPZ identification could be improved)
- if ((documentTypeScientific)){
- String opSymbol = "-";
- if (op.equals("PLUS")){
- opSymbol = "+";
- }
- if (unit.equals("year")){
- String diffString = diff+"";
- if (diff < 10){
- diffString = "000"+diff;
- }
- else if (diff < 100){
- diffString = "00"+diff;
- }
- else if (diff < 1000){
- diffString = "0"+diff;
- }
- valueNew = "TPZ"+opSymbol+diffString;
- }
- else if (unit.equals("month")){
- String diffString = diff+"";
- if (diff < 10){
- diffString = "0000-0"+diff;
- }
- else {
- diffString = "0000-"+diff;
- }
- valueNew = "TPZ"+opSymbol+diffString;
- }
- else if (unit.equals("week")){
- String diffString = diff+"";
- if (diff < 10){
- diffString = "0000-W0"+diff;
- }
- else {
- diffString = "0000-W"+diff;
- }
- valueNew = "TPZ"+opSymbol+diffString;
- }
- else if (unit.equals("day")){
- String diffString = diff+"";
- if (diff < 10){
- diffString = "0000-00-0"+diff;
- }
- else {
- diffString = "0000-00-"+diff;
- }
- valueNew = "TPZ"+opSymbol+diffString;
- }
- else if (unit.equals("hour")){
- String diffString = diff+"";
- if (diff < 10){
- diffString = "0000-00-00T0"+diff;
- }
- else {
- diffString = "0000-00-00T"+diff;
- }
- valueNew = "TPZ"+opSymbol+diffString;
- }
- else if (unit.equals("minute")){
- String diffString = diff+"";
- if (diff < 10){
- diffString = "0000-00-00T00:0"+diff;
- }
- else {
- diffString = "0000-00-00T00:"+diff;
- }
- valueNew = "TPZ"+opSymbol+diffString;
- }
- else if (unit.equals("second")){
- String diffString = diff+"";
- if (diff < 10){
- diffString = "0000-00-00T00:00:0"+diff;
- }
- else {
- diffString = "0000-00-00T00:00:"+diff;
- }
- valueNew = "TPZ"+opSymbol+diffString;
- }
- }
- else{
-
-
- // check for REFUNIT (only allowed for "year")
- if ((ltn.equals("REFUNIT")) && (unit.equals("year"))) {
- String dateWithYear = ContextAnalyzer.getLastMentionedX(linearDates, i, "dateYear", language);
- String year = dateWithYear;
- if (dateWithYear.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX");
- } else {
- if (dateWithYear.startsWith("BC")){
- year = dateWithYear.substring(0,6);
- }
- else{
- year = dateWithYear.substring(0,4);
- }
- if (op.equals("MINUS")) {
- diff = diff * (-1);
- }
- String yearNew = DateCalculator.getXNextYear(dateWithYear, diff);
- String rest = dateWithYear.substring(4);
- valueNew = valueNew.replace(checkUndef, yearNew+rest);
- }
- }
-
-
- // REF and this are handled here
- if (unit.equals("century")) {
- if ((documentTypeNews|documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) {
- int century = dctCentury;
- if (op.equals("MINUS")) {
- century = dctCentury - diff;
- } else if (op.equals("PLUS")) {
- century = dctCentury + diff;
- }
- valueNew = valueNew.replace(checkUndef, century+"");
- } else {
- String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates, i, "century", language);
- if (lmCentury.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XX");
- } else {
- if (op.equals("MINUS")) {
- diff = (-1) * diff;
- }
- lmCentury = DateCalculator.getXNextCentury(lmCentury, diff);
- valueNew = valueNew.replace(checkUndef, lmCentury);
- }
- }
- } else if (unit.equals("decade")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) {
- int dctDecadeLong = Integer.parseInt(dctCentury + "" + dctDecade);
- int decade = dctDecadeLong;
- if (op.equals("MINUS")) {
- decade = dctDecadeLong - diff;
- } else if (op.equals("PLUS")) {
- decade = dctDecadeLong + diff;
- }
- valueNew = valueNew.replace(checkUndef, decade+"X");
- } else {
- String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates, i, "decade", language);
- if (lmDecade.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXX");
- } else {
- if (op.equals("MINUS")) {
- diff = (-1) * diff;
- }
- lmDecade = DateCalculator.getXNextDecade(lmDecade, diff);
- valueNew = valueNew.replace(checkUndef, lmDecade);
- }
- }
- } else if (unit.equals("year")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) {
- int intValue = dctYear;
- if (op.equals("MINUS")) {
- intValue = dctYear - diff;
- } else if (op.equals("PLUS")) {
- intValue = dctYear + diff;
- }
- valueNew = valueNew.replace(checkUndef, intValue + "");
- } else {
- String lmYear = ContextAnalyzer.getLastMentionedX(linearDates, i, "year", language);
- if (lmYear.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX");
- } else {
- if (op.equals("MINUS")) {
- diff = (-1) * diff;
- }
- lmYear = DateCalculator.getXNextYear(lmYear, diff);
- valueNew = valueNew.replace(checkUndef, lmYear);
- }
- }
- // TODO BC years
- } else if (unit.equals("quarter")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) {
- int intYear = dctYear;
- int intQuarter = Integer.parseInt(dctQuarter.substring(1));
- int diffQuarters = diff % 4;
- diff = diff - diffQuarters;
- int diffYears = diff / 4;
- if (op.equals("MINUS")) {
- diffQuarters = diffQuarters * (-1);
- diffYears = diffYears * (-1);
- }
- intYear = intYear + diffYears;
- intQuarter = intQuarter + diffQuarters;
- valueNew = valueNew.replace(checkUndef, intYear+"-Q"+intQuarter);
- } else {
- String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language);
- if (lmQuarter.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- int intYear = Integer.parseInt(lmQuarter.substring(0, 4));
- int intQuarter = Integer.parseInt(lmQuarter.substring(6));
- int diffQuarters = diff % 4;
- diff = diff - diffQuarters;
- int diffYears = diff / 4;
- if (op.equals("MINUS")) {
- diffQuarters = diffQuarters * (-1);
- diffYears = diffYears * (-1);
- }
- intYear = intYear + diffYears;
- intQuarter = intQuarter + diffQuarters;
- valueNew = valueNew.replace(checkUndef, intYear+"-Q"+intQuarter);
- }
- }
- } else if (unit.equals("month")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) {
- if (op.equals("MINUS")) {
- diff = diff * (-1);
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(dctYear + "-" + norm.getFromNormNumber(dctMonth+""), diff));
- } else {
- String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month", language);
- if (lmMonth.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- if (op.equals("MINUS")) {
- diff = diff * (-1);
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(lmMonth, diff));
- }
- }
- } else if (unit.equals("week")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) {
- if (op.equals("MINUS")) {
- diff = diff * (-1);
- } else if (op.equals("PLUS")) {
- // diff = diff * 7;
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(dctYear+"-W"+norm.getFromNormNumber(dctWeek+""), diff, language));
- } else {
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language);
- if (lmDay.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX");
- } else {
- if (op.equals("MINUS")) {
- diff = diff * 7 * (-1);
- } else if (op.equals("PLUS")) {
- diff = diff * 7;
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff));
- }
- }
- } else if (unit.equals("day")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable) && (ltn.equals("this"))) {
- if (op.equals("MINUS")) {
- diff = diff * (-1);
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-" + dctDay, diff));
- } else {
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language);
- if (lmDay.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX");
- } else {
- if (op.equals("MINUS")) {
- diff = diff * (-1);
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff));
- }
- }
- }
- }
- }
- }
-
- // century
- else if (ambigString.startsWith("UNDEF-last-century")) {
- String checkUndef = "UNDEF-last-century";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(dctCentury - 1 +""));
- } else {
- String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates,i,"century", language);
- if (lmCentury.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XX");
- }
- else {
- lmCentury = DateCalculator.getXNextCentury(lmCentury, -1);
- valueNew = valueNew.replace(checkUndef, lmCentury);
- }
- }
- } else if (ambigString.startsWith("UNDEF-this-century")) {
- String checkUndef = "UNDEF-this-century";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(dctCentury+""));
- } else {
- String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates,i,"century", language);
- if (lmCentury.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XX");
- } else {
- valueNew = valueNew.replace(checkUndef, lmCentury);
- }
- }
- } else if (ambigString.startsWith("UNDEF-next-century")) {
- String checkUndef = "UNDEF-next-century";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, norm.getFromNormNumber(dctCentury + 1+""));
- } else {
- String lmCentury = ContextAnalyzer.getLastMentionedX(linearDates,i,"century", language);
- if (lmCentury.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XX");
- } else {
- lmCentury = DateCalculator.getXNextCentury(lmCentury, +1);
- valueNew = valueNew.replace(checkUndef, lmCentury);
- }
- }
- }
-
- // decade
- else if (ambigString.startsWith("UNDEF-last-decade")) {
- String checkUndef = "UNDEF-last-decade";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, (dctYear - 10+"").substring(0,3));
- } else {
- String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates,i,"decade", language);
- if (lmDecade.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX");
- } else {
- lmDecade = DateCalculator.getXNextDecade(lmDecade, -1);
- valueNew = valueNew.replace(checkUndef, lmDecade);
- }
- }
- } else if (ambigString.startsWith("UNDEF-this-decade")) {
- String checkUndef = "UNDEF-this-decade";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, (dctYear+"").substring(0,3));
- } else {
- String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates,i,"decade", language);
- if (lmDecade.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX");
- } else {
- valueNew = valueNew.replace(checkUndef, lmDecade);
- }
- }
- } else if (ambigString.startsWith("UNDEF-next-decade")) {
- String checkUndef = "UNDEF-next-decade";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, (dctYear + 10+"").substring(0,3));
- } else {
- String lmDecade = ContextAnalyzer.getLastMentionedX(linearDates,i,"decade", language);
- if (lmDecade.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX");
- } else {
- lmDecade = DateCalculator.getXNextDecade(lmDecade, 1);
- valueNew = valueNew.replace(checkUndef, lmDecade);
- }
- }
- }
-
- // year
- else if (ambigString.startsWith("UNDEF-last-year")) {
- String checkUndef = "UNDEF-last-year";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, dctYear -1 +"");
- } else {
- String lmYear = ContextAnalyzer.getLastMentionedX(linearDates,i,"year", language);
- if (lmYear.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX");
- } else {
- lmYear = DateCalculator.getXNextYear(lmYear, -1);
- valueNew = valueNew.replace(checkUndef, lmYear);
- }
- }
- if (valueNew.endsWith("-FY")){
- valueNew = "FY" + valueNew.substring(0, Math.min(valueNew.length(), 4));
- }
- } else if (ambigString.startsWith("UNDEF-this-year")) {
- String checkUndef = "UNDEF-this-year";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, dctYear +"");
- } else {
- String lmYear = ContextAnalyzer.getLastMentionedX(linearDates,i,"year", language);
- if (lmYear.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX");
- } else {
- valueNew = valueNew.replace(checkUndef, lmYear);
- }
- }
- if (valueNew.endsWith("-FY")){
- valueNew = "FY" + valueNew.substring(0, Math.min(valueNew.length(), 4));
- }
- } else if (ambigString.startsWith("UNDEF-next-year")) {
- String checkUndef = "UNDEF-next-year";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, dctYear +1 +"");
- } else {
- String lmYear = ContextAnalyzer.getLastMentionedX(linearDates,i,"year", language);
- if (lmYear.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX");
- } else {
- lmYear = DateCalculator.getXNextYear(lmYear, 1);
- valueNew = valueNew.replace(checkUndef, lmYear);
- }
- }
- if (valueNew.endsWith("-FY")){
- valueNew = "FY" + valueNew.substring(0, Math.min(valueNew.length(), 4));
- }
- }
-
- // month
- else if (ambigString.startsWith("UNDEF-last-month")) {
- String checkUndef = "UNDEF-last-month";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(dctYear + "-" + norm.getFromNormNumber(dctMonth+""), -1));
- } else {
- String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates,i,"month", language);
- if (lmMonth.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(lmMonth, -1));
- }
- }
- } else if (ambigString.startsWith("UNDEF-this-month")) {
- String checkUndef = "UNDEF-this-month";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, dctYear + "-" + norm.getFromNormNumber(dctMonth+""));
- } else {
- String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates,i,"month", language);
- if (lmMonth.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- valueNew = valueNew.replace(checkUndef, lmMonth);
- }
- }
- }
- else if (ambigString.startsWith("UNDEF-next-month")) {
- String checkUndef = "UNDEF-next-month";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(dctYear + "-" + norm.getFromNormNumber(dctMonth+""), 1));
- } else {
- String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates,i,"month", language);
- if (lmMonth.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextMonth(lmMonth, 1));
- }
- }
- }
-
- // day
- else if (ambigString.startsWith("UNDEF-last-day")) {
- String checkUndef = "UNDEF-last-day";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-"+ dctDay, -1));
- } else {
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates,i,"day", language);
- if (lmDay.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX");
- } else {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay,-1));
- }
- }
- } else if (ambigString.startsWith("UNDEF-this-day")) {
- String checkUndef = "UNDEF-this-day";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-"+ norm.getFromNormNumber(dctDay+""));
- } else {
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates,i,"day", language);
- if (lmDay.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX");
- } else {
- valueNew = valueNew.replace(checkUndef, lmDay);
- }
- if (ambigString.equals("UNDEF-this-day")) {
- valueNew = "PRESENT_REF";
- }
- }
- }
- else if (ambigString.startsWith("UNDEF-next-day")) {
- String checkUndef = "UNDEF-next-day";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + norm.getFromNormNumber(dctMonth+"") + "-"+ dctDay, 1));
- } else {
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates,i,"day", language);
- if (lmDay.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX");
- } else {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay,1));
- }
- }
- }
-
- // week
- else if (ambigString.startsWith("UNDEF-last-week")) {
- String checkUndef = "UNDEF-last-week";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(dctYear+"-W"+norm.getFromNormNumber(dctWeek+""),-1, language));
- } else {
- String lmWeek = ContextAnalyzer.getLastMentionedX(linearDates,i,"week", language);
- if (lmWeek.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-WXX");
- } else {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(lmWeek,-1, language));
- }
- }
- } else if (ambigString.startsWith("UNDEF-this-week")) {
- String checkUndef = "UNDEF-this-week";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef,dctYear+"-W"+norm.getFromNormNumber(dctWeek+""));
- } else {
- String lmWeek = ContextAnalyzer.getLastMentionedX(linearDates,i,"week", language);
- if (lmWeek.equals("")) {
- valueNew = valueNew.replace(checkUndef,"XXXX-WXX");
- } else {
- valueNew = valueNew.replace(checkUndef,lmWeek);
- }
- }
- } else if (ambigString.startsWith("UNDEF-next-week")) {
- String checkUndef = "UNDEF-next-week";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(dctYear+"-W"+norm.getFromNormNumber(dctWeek+""),1, language));
- } else {
- String lmWeek = ContextAnalyzer.getLastMentionedX(linearDates,i,"week", language);
- if (lmWeek.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-WXX");
- } else {
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextWeek(lmWeek,1, language));
- }
- }
- }
-
- // quarter
- else if (ambigString.startsWith("UNDEF-last-quarter")) {
- String checkUndef = "UNDEF-last-quarter";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- if (dctQuarter.equals("Q1")) {
- valueNew = valueNew.replace(checkUndef, dctYear-1+"-Q4");
- } else {
- int newQuarter = Integer.parseInt(dctQuarter.substring(1,2))-1;
- valueNew = valueNew.replace(checkUndef, dctYear+"-Q"+newQuarter);
- }
- } else {
- String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language);
- if (lmQuarter.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-QX");
- } else {
- int lmQuarterOnly = Integer.parseInt(lmQuarter.substring(6,7));
- int lmYearOnly = Integer.parseInt(lmQuarter.substring(0,4));
- if (lmQuarterOnly == 1) {
- valueNew = valueNew.replace(checkUndef, lmYearOnly-1+"-Q4");
- } else {
- int newQuarter = lmQuarterOnly-1;
- valueNew = valueNew.replace(checkUndef, lmYearOnly+"-Q"+newQuarter);
- }
- }
- }
- } else if (ambigString.startsWith("UNDEF-this-quarter")) {
- String checkUndef = "UNDEF-this-quarter";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+dctQuarter);
- } else {
- String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language);
- if (lmQuarter.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-QX");
- } else {
- valueNew = valueNew.replace(checkUndef, lmQuarter);
- }
- }
- } else if (ambigString.startsWith("UNDEF-next-quarter")) {
- String checkUndef = "UNDEF-next-quarter";
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- if (dctQuarter.equals("Q4")) {
- valueNew = valueNew.replace(checkUndef, dctYear+1+"-Q1");
- } else {
- int newQuarter = Integer.parseInt(dctQuarter.substring(1,2))+1;
- valueNew = valueNew.replace(checkUndef, dctYear+"-Q"+newQuarter);
- }
- } else {
- String lmQuarter = ContextAnalyzer.getLastMentionedX(linearDates, i, "quarter", language);
- if (lmQuarter.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-QX");
- } else {
- int lmQuarterOnly = Integer.parseInt(lmQuarter.substring(6,7));
- int lmYearOnly = Integer.parseInt(lmQuarter.substring(0,4));
- if (lmQuarterOnly == 4) {
- valueNew = valueNew.replace(checkUndef, lmYearOnly+1+"-Q1");
- } else {
- int newQuarter = lmQuarterOnly+1;
- valueNew = valueNew.replace(checkUndef, lmYearOnly+"-Q"+newQuarter);
- }
- }
- }
- }
-
- // MONTH NAMES
- else if (ambigString.matches("UNDEF-(last|this|next)-(january|february|march|april|may|june|july|august|september|october|november|december).*")) {
- for (MatchResult mr : Toolbox.findMatches(Pattern.compile("(UNDEF-(last|this|next)-(january|february|march|april|may|june|july|august|september|october|november|december))(.*)"),ambigString)) {
- String rest = mr.group(4);
- int day = 0;
- for (MatchResult mr_rest : Toolbox.findMatches(Pattern.compile("-([0-9][0-9])"),rest)){
- day = Integer.parseInt(mr_rest.group(1));
- }
- String checkUndef = mr.group(1);
- String ltn = mr.group(2);
- String newMonth = norm.getFromNormMonthName((mr.group(3)));
- int newMonthInt = Integer.parseInt(newMonth);
- if (ltn.equals("last")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- // check day if dct-month and newMonth are equal
- if ((dctMonth == newMonthInt) && (!(day == 0))){
- if (dctDay > day){
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth);
- }
- else{
- valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newMonth);
- }
- }
- else if (dctMonth <= newMonthInt) {
- valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newMonth);
- } else {
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth);
- }
- } else {
- String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month-with-details", language);
- if (lmMonth.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- int lmMonthInt = Integer.parseInt(lmMonth.substring(5,7));
- //
- int lmDayInt = 0;
- if ((lmMonth.length() > 9) && (lmMonth.subSequence(8,10).toString().matches("\\d\\d"))){
- lmDayInt = Integer.parseInt(lmMonth.subSequence(8,10)+"");
- }
- if ((lmMonthInt == newMonthInt) && (!(lmDayInt == 0)) && (!(day == 0))){
- if (lmDayInt > day){
- valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth);
- }
- else{
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmMonth.substring(0,4))-1+"-"+newMonth);
- }
- }
- if (lmMonthInt <= newMonthInt) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmMonth.substring(0,4))-1+"-"+newMonth);
- } else {
- valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth);
- }
- }
- }
- } else if (ltn.equals("this")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth);
- } else {
- String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month-with-details", language);
- if (lmMonth.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth);
- }
- }
- } else if (ltn.equals("next")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- // check day if dct-month and newMonth are equal
- if ((dctMonth == newMonthInt) && (!(day == 0))){
- if (dctDay < day){
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth);
- }
- else{
- valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newMonth);
- }
- }
- else if (dctMonth >= newMonthInt) {
- valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newMonth);
- } else {
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newMonth);
- }
- } else {
- String lmMonth = ContextAnalyzer.getLastMentionedX(linearDates, i, "month-with-details", language);
- if (lmMonth.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- int lmMonthInt = Integer.parseInt(lmMonth.substring(5,7));
- if (lmMonthInt >= newMonthInt) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmMonth.substring(0,4))+1+"-"+newMonth);
- } else {
- valueNew = valueNew.replace(checkUndef, lmMonth.substring(0,4)+"-"+newMonth);
- }
- }
- }
- }
- }
- }
-
- // SEASONS NAMES
- else if (ambigString.matches("^UNDEF-(last|this|next)-(SP|SU|FA|WI).*")) {
- for (MatchResult mr : Toolbox.findMatches(Pattern.compile("(UNDEF-(last|this|next)-(SP|SU|FA|WI)).*"),ambigString)) {
- String checkUndef = mr.group(1);
- String ltn = mr.group(2);
- String newSeason = mr.group(3);
- if (ltn.equals("last")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- if (dctSeason.equals("SP")) {
- valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason);
- } else if (dctSeason.equals("SU")) {
- if (newSeason.equals("SP")) {
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason);
- }
- } else if (dctSeason.equals("FA")) {
- if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) {
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason);
- }
- } else if (dctSeason.equals("WI")) {
- if (newSeason.equals("WI")) {
- valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason);
- } else {
- if (dctMonth < 12){
- valueNew = valueNew.replace(checkUndef, dctYear-1+"-"+newSeason);
- }
- else{
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason);
- }
- }
- }
- } else { // NARRATVIE DOCUMENT
- String lmSeason = ContextAnalyzer.getLastMentionedX(linearDates, i, "season", language);
- if (lmSeason.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- if (lmSeason.substring(5,7).equals("SP")) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason);
- } else if (lmSeason.substring(5,7).equals("SU")) {
- if (lmSeason.substring(5,7).equals("SP")) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason);
- }
- } else if (lmSeason.substring(5,7).equals("FA")) {
- if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason);
- }
- } else if (lmSeason.substring(5,7).equals("WI")) {
- if (newSeason.equals("WI")) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))-1+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason);
- }
- }
- }
- }
- } else if (ltn.equals("this")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- // TODO include tense of sentence?
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason);
- } else {
- // TODO include tense of sentence?
- String lmSeason = ContextAnalyzer.getLastMentionedX(linearDates, i, "season", language);
- if (lmSeason.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- valueNew = valueNew.replace(checkUndef, lmSeason.substring(0,4)+"-"+newSeason);
- }
- }
- } else if (ltn.equals("next")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- if (dctSeason.equals("SP")) {
- if (newSeason.equals("SP")) {
- valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason);
- }
- } else if (dctSeason.equals("SU")) {
- if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) {
- valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason);
- }
- } else if (dctSeason.equals("FA")) {
- if (newSeason.equals("WI")) {
- valueNew = valueNew.replace(checkUndef, dctYear+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason);
- }
- } else if (dctSeason.equals("WI")) {
- valueNew = valueNew.replace(checkUndef, dctYear+1+"-"+newSeason);
- }
- } else { // NARRATIVE DOCUMENT
- String lmSeason = ContextAnalyzer.getLastMentionedX(linearDates, i, "season", language);
- if (lmSeason.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX");
- } else {
- if (lmSeason.substring(5,7).equals("SP")) {
- if (newSeason.equals("SP")) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason);
- }
- } else if (lmSeason.substring(5,7).equals("SU")) {
- if ((newSeason.equals("SP")) || (newSeason.equals("SU"))) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason);
- }
- } else if (lmSeason.substring(5,7).equals("FA")) {
- if (newSeason.equals("WI")) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+"-"+newSeason);
- } else {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason);
- }
- } else if (lmSeason.substring(5,7).equals("WI")) {
- valueNew = valueNew.replace(checkUndef, Integer.parseInt(lmSeason.substring(0,4))+1+"-"+newSeason);
- }
- }
- }
- }
- }
- }
-
- // WEEKDAY NAMES
- // TODO the calculation is strange, but works
- // TODO tense should be included?!
- else if (ambigString.matches("^UNDEF-(last|this|next|day)-(monday|tuesday|wednesday|thursday|friday|saturday|sunday).*")) {
- for (MatchResult mr : Toolbox.findMatches(Pattern.compile("(UNDEF-(last|this|next|day)-(monday|tuesday|wednesday|thursday|friday|saturday|sunday)).*"),ambigString)) {
- String checkUndef = mr.group(1);
- String ltnd = mr.group(2);
- String newWeekday = mr.group(3);
- int newWeekdayInt = Integer.parseInt(norm.getFromNormDayInWeek(newWeekday));
- if (ltnd.equals("last")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- int diff = (-1) * (dctWeekday - newWeekdayInt);
- if (diff >= 0) {
- diff = diff - 7;
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-" + dctDay, diff));
- } else {
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language);
- if (lmDay.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX");
- } else {
- int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay);
- int diff = (-1) * (lmWeekdayInt - newWeekdayInt);
- if (diff >= 0) {
- diff = diff - 7;
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff));
- }
- }
- } else if (ltnd.equals("this")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- // TODO tense should be included?!
- int diff = (-1) * (dctWeekday - newWeekdayInt);
- if (diff >= 0) {
- diff = diff - 7;
- }
- if (diff == -7) {
- diff = 0;
- }
-
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-"+ dctDay, diff));
- } else {
- // TODO tense should be included?!
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language);
- if (lmDay.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX");
- } else {
- int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay);
- int diff = (-1) * (lmWeekdayInt - newWeekdayInt);
- if (diff >= 0) {
- diff = diff - 7;
- }
- if (diff == -7) {
- diff = 0;
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff));
- }
- }
- } else if (ltnd.equals("next")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- int diff = newWeekdayInt - dctWeekday;
- if (diff <= 0) {
- diff = diff + 7;
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-"+ dctDay, diff));
- } else {
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language);
- if (lmDay.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX");
- } else {
- int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay);
- int diff = newWeekdayInt - lmWeekdayInt;
- if (diff <= 0) {
- diff = diff + 7;
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff));
- }
- }
- } else if (ltnd.equals("day")) {
- if ((documentTypeNews||documentTypeColloquial||documentTypeScientific) && (dctAvailable)) {
- // TODO tense should be included?!
- int diff = (-1) * (dctWeekday - newWeekdayInt);
- if (diff >= 0) {
- diff = diff - 7;
- }
- if (diff == -7) {
- diff = 0;
- }
- // Tense is FUTURE
- if ((last_used_tense.equals("FUTURE")) && diff != 0) {
- diff = diff + 7;
- }
- // Tense is PAST
- if ((last_used_tense.equals("PAST"))) {
-
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(dctYear + "-" + dctMonth + "-"+ dctDay, diff));
- } else {
- // TODO tense should be included?!
- String lmDay = ContextAnalyzer.getLastMentionedX(linearDates, i, "day", language);
- if (lmDay.equals("")) {
- valueNew = valueNew.replace(checkUndef, "XXXX-XX-XX");
- } else {
- int lmWeekdayInt = DateCalculator.getWeekdayOfDate(lmDay);
- int diff = (-1) * (lmWeekdayInt - newWeekdayInt);
- if (diff >= 0) {
- diff = diff - 7;
- }
- if (diff == -7) {
- diff = 0;
- }
- valueNew = valueNew.replace(checkUndef, DateCalculator.getXNextDay(lmDay, diff));
- }
- }
- }
- }
-
- } else {
- Logger.printDetail(component, "ATTENTION: UNDEF value for: " + valueNew+" is not handled in disambiguation phase!");
- }
- }
-
- return valueNew;
- }
-
- /**
- * Under-specified values are disambiguated here. Only Timexes of types "date" and "time" can be under-specified.
- * @param jcas
- */
- public void specifyAmbiguousValues(JCas jcas) {
- // build up a list with all found TIMEX expressions
- List linearDates = new ArrayList();
- FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator();
-
- // Create List of all Timexes of types "date" and "time"
- while (iterTimex.hasNext()) {
- Timex3 timex = (Timex3) iterTimex.next();
- if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME")) {
- linearDates.add(timex);
- }
-
- if(timex.getTimexType().equals("DURATION") && !timex.getEmptyValue().equals("")) {
- linearDates.add(timex);
- }
- }
-
- //////////////////////////////////////////////
- // go through list of Date and Time timexes //
- //////////////////////////////////////////////
- for (int i = 0; i < linearDates.size(); i++) {
- Timex3 t_i = (Timex3) linearDates.get(i);
- String value_i = t_i.getTimexValue();
-
- String valueNew = value_i;
- // handle the value attribute only if we have a TIME or DATE
- if(t_i.getTimexType().equals("TIME") || t_i.getTimexType().equals("DATE"))
- valueNew = specifyAmbiguousValuesString(value_i, t_i, i, linearDates, jcas);
-
- // handle the emptyValue attribute for any type
- if(t_i.getEmptyValue() != null && t_i.getEmptyValue().length() > 0) {
- String emptyValueNew = specifyAmbiguousValuesString(t_i.getEmptyValue(), t_i, i, linearDates, jcas);
- t_i.setEmptyValue(emptyValueNew);
- }
-
- t_i.removeFromIndexes();
- Logger.printDetail(t_i.getTimexId()+" DISAMBIGUATION PHASE: foundBy:"+t_i.getFoundByRule()+" text:"+t_i.getCoveredText()+" value:"+t_i.getTimexValue()+" NEW value:"+valueNew);
-
- t_i.setTimexValue(valueNew);
- t_i.addToIndexes();
- linearDates.set(i, t_i);
- }
- }
-
-
- /**
- * @param jcas
- */
- private void deleteOverlappedTimexesPreprocessing(JCas jcas) {
- FSIterator timexIter1 = jcas.getAnnotationIndex(Timex3.type).iterator();
- HashSet hsTimexesToRemove = new HashSet();
- while (timexIter1.hasNext()) {
- Timex3 t1 = (Timex3) timexIter1.next();
- FSIterator timexIter2 = jcas.getAnnotationIndex(Timex3.type).iterator();
-
- while (timexIter2.hasNext()) {
- Timex3 t2 = (Timex3) timexIter2.next();
- if (((t1.getBegin() >= t2.getBegin()) && (t1.getEnd() < t2.getEnd())) || // t1 starts inside or with t2 and ends before t2 -> remove t1
- ((t1.getBegin() > t2.getBegin()) && (t1.getEnd() <= t2.getEnd()))) { // t1 starts inside t2 and ends with or before t2 -> remove t1
- hsTimexesToRemove.add(t1);
- }
- else if (((t2.getBegin() >= t1.getBegin()) && (t2.getEnd() < t1.getEnd())) || // t2 starts inside or with t1 and ends before t1 -> remove t2
- ((t2.getBegin() > t1.getBegin()) && (t2.getEnd() <= t1.getEnd()))) { // t2 starts inside t1 and ends with or before t1 -> remove t2
- hsTimexesToRemove.add(t2);
- }
- // identical length
- if (!t1.equals(t2) && (t1.getBegin() == t2.getBegin()) && (t1.getEnd() == t2.getEnd())) {
- if ((t1.getTimexValue().startsWith("UNDEF")) && (!(t2.getTimexValue().startsWith("UNDEF")))) {
- hsTimexesToRemove.add(t1);
- }
- else if ((!(t1.getTimexValue().startsWith("UNDEF"))) && (t2.getTimexValue().startsWith("UNDEF"))) {
- hsTimexesToRemove.add(t2);
- }
- // t1 is explicit, but t2 is not
- else if ((t1.getFoundByRule().endsWith("explicit")) && (!(t2.getFoundByRule().endsWith("explicit")))) {
- hsTimexesToRemove.add(t2);
- }
- // remove timexes that are identical, but one has an emptyvalue
- else if(t2.getEmptyValue().equals("") && !t1.getEmptyValue().equals("")) {
- hsTimexesToRemove.add(t2);
- }
- // REMOVE REAL DUPLICATES (the one with the lower timexID)
- else if ((Integer.parseInt(t1.getTimexId().substring(1)) < Integer.parseInt(t2.getTimexId().substring(1)))) {
- hsTimexesToRemove.add(t1);
- }
- }
- }
- }
- // remove, finally
- for (Timex3 t : hsTimexesToRemove) {
- Logger.printDetail("REMOVE DUPLICATE: " + t.getCoveredText()+"(id:"+t.getTimexId()+" value:"+t.getTimexValue()+" found by:"+t.getFoundByRule()+")");
-
- t.removeFromIndexes();
- timex_counter--;
- }
- }
-
- private void deleteOverlappedTimexesPostprocessing(JCas jcas) {
- FSIterator timexIter = jcas.getAnnotationIndex(Timex3.type).iterator();
- FSIterator innerTimexIter = timexIter.copy();
- HashSet> effectivelyToInspect = new HashSet>();
- ArrayList allTimexesToInspect = new ArrayList();
- while(timexIter.hasNext()) {
- Timex3 myTimex = (Timex3) timexIter.next();
-
- ArrayList timexSet = new ArrayList();
- if (!(myTimex.getTimexType().equals("TEMPONYM"))) {
- timexSet.add(myTimex);
- }
-
- // compare this timex to all other timexes and mark those that have an overlap
- while(innerTimexIter.hasNext()) {
- Timex3 myInnerTimex = (Timex3) innerTimexIter.next();
- if (!(myTimex.getTimexType().equals("TEMPONYM"))) {
- if((myTimex.getBegin() <= myInnerTimex.getBegin() && myTimex.getEnd() > myInnerTimex.getBegin()) || // timex1 starts, timex2 is partial overlap
- (myInnerTimex.getBegin() <= myTimex.getBegin() && myInnerTimex.getEnd() > myTimex.getBegin()) || // same as above, but in reverse
- (myInnerTimex.getBegin() <= myTimex.getBegin() && myTimex.getEnd() <= myInnerTimex.getEnd()) || // timex 1 is contained within or identical to timex2
- (myTimex.getBegin() <= myInnerTimex.getBegin() && myInnerTimex.getEnd() <= myTimex.getEnd())) { // same as above, but in reverse
- timexSet.add(myInnerTimex); // increase the set
-
- allTimexesToInspect.add(myTimex); // note that these timexes are being looked at
- allTimexesToInspect.add(myInnerTimex);
- }
- }
- }
-
- // if overlaps with myTimex were detected, memorize them
- if(timexSet.size() > 1)
- effectivelyToInspect.add(timexSet);
-
- // reset the inner iterator
- innerTimexIter.moveToFirst();
- }
-
- /* prune those sets of overlapping timexes that are subsets of others
- * (i.e. leave only the largest union of overlapping timexes)
- */
- HashSet> newEffectivelyToInspect = new HashSet>();
- for(Timex3 t : allTimexesToInspect) {
- ArrayList setToKeep = new ArrayList();
-
- // determine the largest set that contains this timex
- for(ArrayList tSet : effectivelyToInspect) {
- if(tSet.contains(t) && tSet.size() > setToKeep.size())
- setToKeep = tSet;
- }
-
- newEffectivelyToInspect.add(setToKeep);
- }
- // overwrite previous list of sets
- effectivelyToInspect = newEffectivelyToInspect;
-
- // iterate over the selected sets and merge information, remove old timexes
- for(ArrayList tSet : effectivelyToInspect) {
- Timex3 newTimex = new Timex3(jcas);
-
- // if a timex has the timex value REMOVE, remove it from consideration
- @SuppressWarnings("unchecked")
- ArrayList newTSet = (ArrayList) tSet.clone();
- for(Timex3 t : tSet) {
- if(t.getTimexValue().equals("REMOVE")) { // remove timexes with value "REMOVE"
- newTSet.remove(t);
- }
- }
- tSet = newTSet;
-
- // iteration is done if all the timexes have been removed, i.e. the set is empty
- if(tSet.size() == 0)
- continue;
-
- /*
- * check
- * - whether all timexes of this set have the same timex type attribute,
- * - which one in the set has the longest value attribute string length,
- * - what the combined extents are
- */
- Boolean allSameTypes = true;
- String timexType = null;
- Timex3 longestTimex = null;
- Integer combinedBegin = Integer.MAX_VALUE, combinedEnd = Integer.MIN_VALUE;
- ArrayList tokenIds = new ArrayList();
- for(Timex3 t : tSet) {
- // check whether the types are identical and either all DATE or TIME
- if(timexType == null) {
- timexType = t.getTimexType();
- } else {
- if(allSameTypes && !timexType.equals(t.getTimexType()) || !(timexType.equals("DATE") || timexType.equals("TIME"))) {
- allSameTypes = false;
- }
- }
- Logger.printDetail("Are these overlapping timexes of same type? => " + allSameTypes);
-
- // check timex value attribute string length
- if(longestTimex == null) {
- longestTimex = t;
- } else if(allSameTypes && t.getFoundByRule().indexOf("-BCADhint") != -1) {
- longestTimex = t;
- } else if(allSameTypes && t.getFoundByRule().indexOf("relative") == -1 && longestTimex.getFoundByRule().indexOf("relative") != -1) {
- longestTimex = t;
- } else if(longestTimex.getTimexValue().length() == t.getTimexValue().length()) {
- if(t.getBegin() < longestTimex.getBegin())
- longestTimex = t;
- } else if(longestTimex.getTimexValue().length() < t.getTimexValue().length()) {
- longestTimex = t;
- }
- Logger.printDetail("Selected " + longestTimex.getTimexId() + ": " + longestTimex.getCoveredText() +
- "[" + longestTimex.getTimexValue() + "] as the longest-valued timex.");
-
- // check combined beginning/end
- if(combinedBegin > t.getBegin())
- combinedBegin = t.getBegin();
- if(combinedEnd < t.getEnd())
- combinedEnd = t.getEnd();
- Logger.printDetail("Selected combined constraints: " + combinedBegin + ":" + combinedEnd);
-
- // disassemble and remember the token ids
- String[] tokenizedTokenIds = t.getAllTokIds().split("<-->");
- for(Integer i = 1; i < tokenizedTokenIds.length; i++) {
- if(!tokenIds.contains(Integer.parseInt(tokenizedTokenIds[i]))) {
- tokenIds.add(Integer.parseInt(tokenizedTokenIds[i]));
- }
- }
- }
-
- /* types are equal => merge constraints, use the longer, "more granular" value.
- * if types are not equal, just take the longest value.
- */
- Collections.sort(tokenIds);
- newTimex = longestTimex;
- if(allSameTypes) {
- newTimex.setBegin(combinedBegin);
- newTimex.setEnd(combinedEnd);
- if(tokenIds.size() > 0)
- newTimex.setFirstTokId(tokenIds.get(0));
- String tokenIdText = "BEGIN";
- for(Integer tokenId : tokenIds) {
- tokenIdText += "<-->" + tokenId;
- }
- newTimex.setAllTokIds(tokenIdText);
- }
-
- // remove old overlaps.
- for(Timex3 t : tSet) {
- t.removeFromIndexes();
- }
- // add the single constructed/chosen timex to the indexes.
- newTimex.addToIndexes();
- }
- }
-
-
/**
* Identify the part of speech (POS) of a MarchResult.
+ *
* @param tokBegin
* @param tokEnd
* @param s
@@ -2129,9 +711,9 @@ private void deleteOverlappedTimexesPostprocessing(JCas jcas) {
public String getPosFromMatchResult(int tokBegin, int tokEnd, Sentence s, JCas jcas) {
// get all tokens in sentence
HashMap hmTokens = new HashMap();
- FSIterator iterTok = jcas.getAnnotationIndex(Token.type).subiterator(s);
- while (iterTok.hasNext()) {
- Token token = (Token) iterTok.next();
+ AnnotationIndex tokens = jcas.getAnnotationIndex(Token.type);
+ for (FSIterator iterTok = tokens.subiterator(s); iterTok.hasNext();) {
+ Token token = iterTok.next();
hmTokens.put(token.getBegin(), token);
}
// get correct token
@@ -2143,424 +725,153 @@ public String getPosFromMatchResult(int tokBegin, int tokEnd, Sentence s, JCas j
return pos;
}
-
+ // pattern for offset information
+ Pattern paOffset = Pattern.compile("group\\(([0-9]+)\\)-group\\(([0-9]+)\\)");
+
/**
* Apply the extraction rules, normalization rules
+ *
* @param timexType
- * @param hmPattern
- * @param hmOffset
- * @param hmNormalization
- * @param s
+ * Type to find
+ * @param sortedRules
+ * sorted rules
+ * @param startpos
+ * Valid starting positions
+ * @param endpos
+ * Valid end positions
+ * @param s
+ * Sentence
* @param jcas
+ * JCas
+ * @param coveredText
+ * covered text
*/
- public void findTimexes(String timexType,
- HashMap hmPattern,
- HashMap hmOffset,
- HashMap hmNormalization,
- Sentence s,
- JCas jcas) {
- RuleManager rm = RuleManager.getInstance(language, find_temponyms);
- HashMap hmDatePosConstraint = rm.getHmDatePosConstraint();
- HashMap hmDurationPosConstraint = rm.getHmDurationPosConstraint();
- HashMap hmTimePosConstraint = rm.getHmTimePosConstraint();
- HashMap hmSetPosConstraint = rm.getHmSetPosConstraint();
- HashMap hmTemponymPosConstraint = rm.getHmTemponymPosConstraint();
-
- // get fast check patterns first
- HashMap hmDateFastCheck = rm.getHmDateFastCheck();
- HashMap hmDurationFastCheck = rm.getHmDurationFastCheck();
- HashMap hmTimeFastCheck = rm.getHmTimeFastCheck();
- HashMap hmSetFastCheck = rm.getHmSetFastCheck();
- HashMap hmTemponymFastCheck = rm.getHmTemponymFastCheck();
- Pattern f = null;
- Boolean fastCheckOK = true;
-
+ public void findTimexes(String timexType, List sortedRules, TokenBoundaryMatcher matcher, Sentence s, JCas jcas, CharSequence coveredText) {
// Iterator over the rules by sorted by the name of the rules
- // this is important since later, the timexId will be used to
+ // this is important since later, the timexId will be used to
// decide which of two expressions shall be removed if both
// have the same offset
- for (Iterator i = Toolbox.sortByValue(hmPattern).iterator(); i.hasNext(); ) {
- Pattern p = (Pattern) i.next();
-
- // validate fast check fist, if no fast match, everything else is not required anymore
- if (timexType.equals("DATE")) {
- f = hmDateFastCheck.get(hmPattern.get(p));
- } else if (timexType.equals("Time")) {
- f = hmTimeFastCheck.get(hmPattern.get(p));
- } else if (timexType.equals("DURATION")) {
- f = hmDurationFastCheck.get(hmPattern.get(p));
- } else if (timexType.equals("SET")) {
- f = hmSetFastCheck.get(hmPattern.get(p));
- } else if (timexType.equals("TEMPONYM")) {
- f = hmTemponymFastCheck.get(hmPattern.get(p));
- }
- if (!(f == null)){
- fastCheckOK = false;
-
- if (f.matcher(s.getCoveredText()).find()) {
- fastCheckOK = true;
- }
- }
-
-
- if (fastCheckOK) {
- for (MatchResult r : Toolbox.findMatches(p, s.getCoveredText())) {
- boolean infrontBehindOK = ContextAnalyzer.checkTokenBoundaries(r, s, jcas) // improved token boundary checking
- && ContextAnalyzer.checkInfrontBehind(r, s);
-
-
- // CHECK POS CONSTRAINTS
- boolean posConstraintOK = true;
-
- if (timexType.equals("DATE")) {
- if (hmDatePosConstraint.containsKey(hmPattern.get(p))) {
- posConstraintOK = checkPosConstraint(s , hmDatePosConstraint.get(hmPattern.get(p)), r, jcas);
- }
- } else if (timexType.equals("DURATION")) {
- if (hmDurationPosConstraint.containsKey(hmPattern.get(p))) {
- posConstraintOK = checkPosConstraint(s , hmDurationPosConstraint.get(hmPattern.get(p)), r, jcas);
- }
- } else if (timexType.equals("TIME")) {
- if (hmTimePosConstraint.containsKey(hmPattern.get(p))) {
- posConstraintOK = checkPosConstraint(s , hmTimePosConstraint.get(hmPattern.get(p)), r, jcas);
- }
- } else if (timexType.equals("SET")) {
- if (hmSetPosConstraint.containsKey(hmPattern.get(p))) {
- posConstraintOK = checkPosConstraint(s , hmSetPosConstraint.get(hmPattern.get(p)), r, jcas);
- }
- } else if (timexType.equals("TEMPONYM")) {
- if (hmTemponymPosConstraint.containsKey(hmPattern.get(p))) {
- posConstraintOK = checkPosConstraint(s , hmSetPosConstraint.get(hmPattern.get(p)), r, jcas);
- }
- }
-
- if ((infrontBehindOK == true) && (posConstraintOK == true)) {
-
- // Offset of timex expression (in the checked sentence)
- int timexStart = r.start();
- int timexEnd = r.end();
-
- // Normalization from Files:
-
- // Any offset parameter?
- if (hmOffset.containsKey(hmPattern.get(p))) {
- String offset = hmOffset.get(hmPattern.get(p));
-
- // pattern for offset information
- Pattern paOffset = Pattern.compile("group\\(([0-9]+)\\)-group\\(([0-9]+)\\)");
- for (MatchResult mr : Toolbox.findMatches(paOffset,offset)) {
- int startOffset = Integer.parseInt(mr.group(1));
- int endOffset = Integer.parseInt(mr.group(2));
- timexStart = r.start(startOffset);
- timexEnd = r.end(endOffset);
- }
- }
-
- // Normalization Parameter
- if (hmNormalization.containsKey(hmPattern.get(p))) {
- String[] attributes = new String[5];
- if (timexType.equals("DATE")) {
- attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmDateNormalization(), rm.getHmDateQuant(), rm.getHmDateFreq(), rm.getHmDateMod(), rm.getHmDateEmptyValue(), r, jcas);
- } else if (timexType.equals("DURATION")) {
- attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmDurationNormalization(), rm.getHmDurationQuant(), rm.getHmDurationFreq(), rm.getHmDurationMod(), rm.getHmDurationEmptyValue(), r, jcas);
- } else if (timexType.equals("TIME")) {
- attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmTimeNormalization(), rm.getHmTimeQuant(), rm.getHmTimeFreq(), rm.getHmTimeMod(), rm.getHmTimeEmptyValue(), r, jcas);
- } else if (timexType.equals("SET")) {
- attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmSetNormalization(), rm.getHmSetQuant(), rm.getHmSetFreq(), rm.getHmSetMod(), rm.getHmSetEmptyValue(), r, jcas);
- } else if (timexType.equals("TEMPONYM")) {
- attributes = getAttributesForTimexFromFile(hmPattern.get(p), rm.getHmTemponymNormalization(), rm.getHmTemponymQuant(), rm.getHmTemponymFreq(), rm.getHmTemponymMod(), rm.getHmTemponymEmptyValue(), r, jcas);
- }
- if (!(attributes == null)) {
- addTimexAnnotation(timexType, timexStart + s.getBegin(), timexEnd + s.getBegin(), s,
- attributes[0], attributes[1], attributes[2], attributes[3], attributes[4], "t" + timexID++, hmPattern.get(p), jcas);
- }
- }
- else {
- Logger.printError("SOMETHING REALLY WRONG HERE: "+hmPattern.get(p));
- }
+ for (Rule rule : sortedRules) {
+ String key = rule.getName();
+ // validate fast check first, if no fast match, everything else is
+ // not required anymore
+ Pattern f = rule.getFastCheck();
+ if (f != null && matcher.matchNext(0, f.matcher(coveredText), key) < 0)
+ continue;
+
+ Matcher m = rule.getPattern().matcher(coveredText);
+ for (int tpos = 0; (tpos = matcher.matchNext(tpos, m, key)) >= 0;) {
+ // CHECK POS CONSTRAINTS
+ String constraint = rule.getPosConstratint();
+ if (constraint != null && !checkPosConstraint(key, s, constraint, m, jcas))
+ continue;
+ // Offset of timex expression (in the checked sentence)
+ int timexStart = m.start(), timexEnd = m.end();
+
+ // Any offset parameter?
+ String offset = rule.getOffset();
+ if (offset != null) {
+ Matcher mr = paOffset.matcher(offset);
+ if (mr.matches()) {
+ timexStart = m.start(parseInt(mr.group(1)));
+ timexEnd = m.end(parseInt(mr.group(2)));
+ } else {
+ LOG.warn("Offset pattern does not match: {}", offset);
}
}
+
+ // Normalization Parameter
+ if (rule.getNormalization() == null) {
+ LOG.warn("No normalization pattern for: {}", key);
+ continue;
+ }
+ String[] attributes = getAttributesForTimexFromFile(key, rule, m, jcas);
+ if (attributes != null) {
+ addTimexAnnotation(timexType, timexStart + s.getBegin(), timexEnd + s.getBegin(), s, attributes[0], attributes[1], attributes[2], attributes[3], attributes[4],
+ "t" + timexID++, key, jcas);
+ }
}
- fastCheckOK = true;
}
}
-
-
+
+ static Pattern paConstraint = Pattern.compile("group\\(([0-9]+)\\):(.*?):");
+
/**
* Check whether the part of speech constraint defined in a rule is satisfied.
+ *
+ * @param rule
+ * Rule name, for error reporting
* @param s
* @param posConstraint
* @param m
* @param jcas
* @return
*/
- public boolean checkPosConstraint(Sentence s, String posConstraint, MatchResult m, JCas jcas) {
- Pattern paConstraint = Pattern.compile("group\\(([0-9]+)\\):(.*?):");
- for (MatchResult mr : Toolbox.findMatches(paConstraint,posConstraint)) {
- int groupNumber = Integer.parseInt(mr.group(1));
- int tokenBegin = s.getBegin() + m.start(groupNumber);
- int tokenEnd = s.getBegin() + m.end(groupNumber);
- String pos = mr.group(2);
- String pos_as_is = getPosFromMatchResult(tokenBegin, tokenEnd ,s, jcas);
- if (pos_as_is.matches(pos)) {
- Logger.printDetail("POS CONSTRAINT IS VALID: pos should be "+pos+" and is "+pos_as_is);
- } else {
- return false;
+ public boolean checkPosConstraint(String rule, Sentence s, String posConstraint, MatchResult m, JCas jcas) {
+ Matcher mr = paConstraint.matcher(posConstraint);
+ while (mr.find()) {
+ try {
+ int groupNumber = parseInt(mr.group(1));
+ int tokenBegin = s.getBegin() + m.start(groupNumber);
+ int tokenEnd = s.getBegin() + m.end(groupNumber);
+ String pos = mr.group(2);
+ String pos_as_is = getPosFromMatchResult(tokenBegin, tokenEnd, s, jcas);
+ if (!pos_as_is.matches(pos))
+ return false;
+ if (LOG.isTraceEnabled())
+ LOG.trace("POS CONSTRAINT IS VALID: pos should be {} and is {}", pos, pos_as_is);
+ } catch (IndexOutOfBoundsException e) {
+ LOG.debug("Bad group number in rule {}", rule);
}
}
return true;
}
-
-
- public String applyRuleFunctions(String tonormalize, MatchResult m) {
- NormalizationManager norm = NormalizationManager.getInstance(language, find_temponyms);
-
- String normalized = "";
- // pattern for normalization functions + group information
- // pattern for group information
- Pattern paNorm = Pattern.compile("%([A-Za-z0-9]+?)\\(group\\(([0-9]+)\\)\\)");
- Pattern paGroup = Pattern.compile("group\\(([0-9]+)\\)");
- while ((tonormalize.contains("%")) || (tonormalize.contains("group"))) {
- // replace normalization functions
- for (MatchResult mr : Toolbox.findMatches(paNorm,tonormalize)) {
- Logger.printDetail("-----------------------------------");
- Logger.printDetail("DEBUGGING: tonormalize:"+tonormalize);
- Logger.printDetail("DEBUGGING: mr.group():"+mr.group());
- Logger.printDetail("DEBUGGING: mr.group(1):"+mr.group(1));
- Logger.printDetail("DEBUGGING: mr.group(2):"+mr.group(2));
- Logger.printDetail("DEBUGGING: m.group():"+m.group());
- Logger.printDetail("DEBUGGING: m.group("+Integer.parseInt(mr.group(2))+"):"+m.group(Integer.parseInt(mr.group(2))));
- Logger.printDetail("DEBUGGING: hmR...:"+norm.getFromHmAllNormalization(mr.group(1)).get(m.group(Integer.parseInt(mr.group(2)))));
- Logger.printDetail("-----------------------------------");
-
- if (! (m.group(Integer.parseInt(mr.group(2))) == null)) {
- String partToReplace = m.group(Integer.parseInt(mr.group(2))).replaceAll("[\n\\s]+", " ");
- if (!(norm.getFromHmAllNormalization(mr.group(1)).containsKey(partToReplace))) {
- Logger.printDetail("Maybe problem with normalization of the resource: "+mr.group(1));
- Logger.printDetail("Maybe problem with part to replace? "+partToReplace);
- if (mr.group(1).contains("Temponym")){
- Logger.printDetail("Should be ok, as it's a temponym.");
- return null;
- }
- }
- else {
- tonormalize = tonormalize.replace(mr.group(), norm.getFromHmAllNormalization(mr.group(1)).get(partToReplace));
- }
- } else {
- Logger.printDetail("Empty part to normalize in "+mr.group(1));
-
- tonormalize = tonormalize.replace(mr.group(), "");
- }
- }
- // replace other groups
- for (MatchResult mr : Toolbox.findMatches(paGroup,tonormalize)) {
- Logger.printDetail("-----------------------------------");
- Logger.printDetail("DEBUGGING: tonormalize:"+tonormalize);
- Logger.printDetail("DEBUGGING: mr.group():"+mr.group());
- Logger.printDetail("DEBUGGING: mr.group(1):"+mr.group(1));
- Logger.printDetail("DEBUGGING: m.group():"+m.group());
- Logger.printDetail("DEBUGGING: m.group("+Integer.parseInt(mr.group(1))+"):"+m.group(Integer.parseInt(mr.group(1))));
- Logger.printDetail("-----------------------------------");
-
- tonormalize = tonormalize.replace(mr.group(), m.group(Integer.parseInt(mr.group(1))));
- }
- // replace substrings
- Pattern paSubstring = Pattern.compile("%SUBSTRING%\\((.*?),([0-9]+),([0-9]+)\\)");
- for (MatchResult mr : Toolbox.findMatches(paSubstring,tonormalize)) {
- String substring = mr.group(1).substring(Integer.parseInt(mr.group(2)), Integer.parseInt(mr.group(3)));
- tonormalize = tonormalize.replace(mr.group(),substring);
- }
- if(language.getName().compareTo("arabic") != 0)
- {
- // replace lowercase
- Pattern paLowercase = Pattern.compile("%LOWERCASE%\\((.*?)\\)");
- for (MatchResult mr : Toolbox.findMatches(paLowercase,tonormalize)) {
- String substring = mr.group(1).toLowerCase();
- tonormalize = tonormalize.replace(mr.group(),substring);
- }
-
- // replace uppercase
- Pattern paUppercase = Pattern.compile("%UPPERCASE%\\((.*?)\\)");
- for (MatchResult mr : Toolbox.findMatches(paUppercase,tonormalize)) {
- String substring = mr.group(1).toUpperCase();
- tonormalize = tonormalize.replace(mr.group(),substring);
- }
- }
- // replace sum, concatenation
- Pattern paSum = Pattern.compile("%SUM%\\((.*?),(.*?)\\)");
- for (MatchResult mr : Toolbox.findMatches(paSum,tonormalize)) {
- int newValue = Integer.parseInt(mr.group(1)) + Integer.parseInt(mr.group(2));
- tonormalize = tonormalize.replace(mr.group(), newValue+"");
- }
- // replace normalization function without group
- Pattern paNormNoGroup = Pattern.compile("%([A-Za-z0-9]+?)\\((.*?)\\)");
- for (MatchResult mr : Toolbox.findMatches(paNormNoGroup, tonormalize)) {
- tonormalize = tonormalize.replace(mr.group(),norm.getFromHmAllNormalization(mr.group(1)).get(mr.group(2)));
- }
- // replace Chinese with Arabic numerals
- Pattern paChineseNorm = Pattern.compile("%CHINESENUMBERS%\\((.*?)\\)");
- for (MatchResult mr : Toolbox.findMatches(paChineseNorm, tonormalize)) {
- RegexHashMap chineseNumerals = new RegexHashMap();
- chineseNumerals.put("[零00]", "0");
- chineseNumerals.put("[一11]", "1");
- chineseNumerals.put("[二22]", "2");
- chineseNumerals.put("[三33]", "3");
- chineseNumerals.put("[四44]", "4");
- chineseNumerals.put("[五55]", "5");
- chineseNumerals.put("[六66]", "6");
- chineseNumerals.put("[七77]", "7");
- chineseNumerals.put("[八88]", "8");
- chineseNumerals.put("[九99]", "9");
- String outString = "";
- for(Integer i = 0; i < mr.group(1).length(); i++) {
- String thisChar = mr.group(1).substring(i, i+1);
- if(chineseNumerals.containsKey(thisChar)){
- outString += chineseNumerals.get(thisChar);
- } else {
- System.out.println(chineseNumerals.entrySet());
- Logger.printError(component, "Found an error in the resources: " + mr.group(1) + " contains " +
- "a character that is not defined in the Chinese numerals map. Normalization may be mangled.");
- outString += thisChar;
- }
- }
- tonormalize = tonormalize.replace(mr.group(), outString);
- }
- }
- normalized = tonormalize;
- return normalized;
- }
-
-
- public String[] getAttributesForTimexFromFile(String rule,
- HashMap hmNormalization,
- HashMap hmQuant,
- HashMap hmFreq,
- HashMap hmMod,
- HashMap hmEmptyValue,
- MatchResult m,
- JCas jcas) {
+
+ public String[] getAttributesForTimexFromFile(String key, Rule rule, MatchResult m, JCas jcas) {
String[] attributes = new String[5];
- String value = "";
- String quant = "";
- String freq = "";
- String mod = "";
- String emptyValue = "";
-
+
// Normalize Value
- String value_normalization_pattern = hmNormalization.get(rule);
- value = applyRuleFunctions(value_normalization_pattern, m);
- if (value == null) return null;
-
+ String value_normalization_pattern = rule.getNormalization();
+ NormalizationManager norm = NormalizationManager.getInstance(language, find_temponyms);
+ String value = RuleExpansion.applyRuleFunctions(key, value_normalization_pattern, m, norm, language);
+ if (value == null)
+ return null;
+ // For example "PT24H" -> "P1D"
+ if (group_gran)
+ value = DurationSimplification.simplify(value);
+ attributes[0] = value;
+
// get quant
- if (hmQuant.containsKey(rule)) {
- String quant_normalization_pattern = hmQuant.get(rule);
- quant = applyRuleFunctions(quant_normalization_pattern, m);
- }
+ String quant_normalization_pattern = rule.getQuant();
+ attributes[1] = (quant_normalization_pattern != null) ? RuleExpansion.applyRuleFunctions(key, quant_normalization_pattern, m, norm, language) : "";
// get freq
- if (hmFreq.containsKey(rule)) {
- String freq_normalization_pattern = hmFreq.get(rule);
- freq = applyRuleFunctions(freq_normalization_pattern, m);
- }
-
+ String freq_normalization_pattern = rule.getFreq();
+ attributes[2] = (freq_normalization_pattern != null) ? RuleExpansion.applyRuleFunctions(key, freq_normalization_pattern, m, norm, language) : "";
+
// get mod
- if (hmMod.containsKey(rule)) {
- String mod_normalization_pattern = hmMod.get(rule);
- mod = applyRuleFunctions(mod_normalization_pattern, m);
- }
-
+ String mod_normalization_pattern = rule.getMod();
+ attributes[3] = (mod_normalization_pattern != null) ? RuleExpansion.applyRuleFunctions(key, mod_normalization_pattern, m, norm, language) : "";
+
// get emptyValue
- if (hmEmptyValue.containsKey(rule)) {
- String emptyValue_normalization_pattern = hmEmptyValue.get(rule);
- emptyValue = applyRuleFunctions(emptyValue_normalization_pattern, m);
- emptyValue = correctDurationValue(emptyValue);
- }
- // For example "PT24H" -> "P1D"
- if (group_gran)
- value = correctDurationValue(value);
+ String emptyValue_normalization_pattern = rule.getEmptyValue();
+ attributes[4] = (emptyValue_normalization_pattern != null) ? //
+ DurationSimplification.simplify(RuleExpansion.applyRuleFunctions(key, emptyValue_normalization_pattern, m, norm, language)) : "";
- attributes[0] = value;
- attributes[1] = quant;
- attributes[2] = freq;
- attributes[3] = mod;
- attributes[4] = emptyValue;
-
return attributes;
}
-
/**
- * Durations of a finer granularity are mapped to a coarser one if possible, e.g., "PT24H" -> "P1D".
- * One may add several further corrections.
- * @param value
- * @return
- */
- public String correctDurationValue(String value) {
- if (value.matches("PT[0-9]+H")){
- for (MatchResult mr : Toolbox.findMatches(Pattern.compile("PT([0-9]+)H"), value)){
- try {
- int hours = Integer.parseInt(mr.group(1));
- if ((hours % 24) == 0){
- int days = hours / 24;
- value = "P"+days+"D";
- }
- } catch(NumberFormatException e) {
- Logger.printDetail(component, "Couldn't do granularity conversion for " + value);
- }
- }
- } else if (value.matches("PT[0-9]+M")){
- for (MatchResult mr : Toolbox.findMatches(Pattern.compile("PT([0-9]+)M"), value)){
- try {
- int minutes = Integer.parseInt(mr.group(1));
- if ((minutes % 60) == 0){
- int hours = minutes / 60;
- value = "PT"+hours+"H";
- }
- } catch(NumberFormatException e) {
- Logger.printDetail(component, "Couldn't do granularity conversion for " + value);
- }
- }
- } else if (value.matches("P[0-9]+M")){
- for (MatchResult mr : Toolbox.findMatches(Pattern.compile("P([0-9]+)M"), value)){
- try {
- int months = Integer.parseInt(mr.group(1));
- if ((months % 12) == 0){
- int years = months / 12;
- value = "P"+years+"Y";
- }
- } catch(NumberFormatException e) {
- Logger.printDetail(component, "Couldn't do granularity conversion for " + value);
- }
- }
- }
- return value;
- }
-
- /**
- * Check whether or not a jcas object has a correct DCT value.
- * If there is no DCT present, we canonically return true since
- * fallback calculation takes care of that scenario.
- * @param jcas
- * @return Whether or not the given jcas contains a valid DCT
+ * takes a desired locale input string, iterates through available locales, returns a locale object
+ *
+ * @param locale
+ * String to grab a locale for, i.e. en_US, en_GB, de_DE
+ * @return Locale to represent the input String
*/
- private Boolean isValidDCT(JCas jcas) {
- FSIterator dctIter = jcas.getAnnotationIndex(Dct.type).iterator();
-
- if(!dctIter.hasNext()) {
- return true;
- } else {
- Dct dct = (Dct) dctIter.next();
- String dctVal = dct.getValue();
-
- if(dctVal == null)
- return false;
-
- if(dctVal.matches("\\d{8}") // Something like 20041224
- || dctVal.matches("\\d{4}.\\d{2}.\\d{2}.*")) { // Something like 2004-12-24
- return true;
- } else {
- return false;
- }
- }
+ public static Locale getLocaleFromString(String locale) throws LocaleException {
+ for (Locale l : Locale.getAvailableLocales())
+ if (locale.equalsIgnoreCase(l.toString()))
+ return l;
+ throw new LocaleException();
}
}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/ProcessorManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/ProcessorManager.java
index 11f06d73..0bd9f8c1 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/ProcessorManager.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/ProcessorManager.java
@@ -5,9 +5,10 @@
import org.apache.uima.UimaContext;
import org.apache.uima.jcas.JCas;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import de.unihd.dbs.uima.annotator.heideltime.processors.GenericProcessor;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
/**
* This class implements a singleton "Addon Manager". Any subroutine (Processor) that
* may be added to HeidelTime's code to achieve a specific goal which is self-sufficient,
@@ -20,12 +21,13 @@
*
*/
public class ProcessorManager {
+ /** Class logger */
+ private static final Logger LOG = LoggerFactory.getLogger(ProcessorManager.class);
+
// list of processes' package names
private EnumMap> processorNames;
// array of instantiated processors
private EnumMap> processors;
- // self-identifying component for logging purposes
- private Class> component;
// flag for whether the processors have been initialized
private boolean initialized = false;
@@ -34,7 +36,6 @@ public class ProcessorManager {
*/
public ProcessorManager() {
this.processorNames = new EnumMap>(Priority.class);
- this.component = this.getClass();
this.processors = new EnumMap>(Priority.class);
for(Priority prio : Priority.values()) {
@@ -74,8 +75,7 @@ public void initializeAllProcessors(UimaContext aContext) {
p.initialize(aContext);
processors.get(prio).add(p);
} catch (Exception exception) {
- exception.printStackTrace();
- Logger.printError(component, "Unable to initialize registered Processor " + pn + ", got: " + exception.toString());
+ LOG.error("Unable to initialize registered Processor " + pn + ", got: " + exception.toString(), exception);
System.exit(-1);
}
}
@@ -91,7 +91,7 @@ public void initializeAllProcessors(UimaContext aContext) {
*/
public void executeProcessors(JCas jcas, ProcessorManager.Priority prio) {
if(!this.initialized) {
- Logger.printError(component, "Unable to execute Processors; initialization was not concluded successfully.");
+ LOG.error("Unable to execute Processors; initialization was not concluded successfully.");
System.exit(-1);
}
@@ -100,8 +100,7 @@ public void executeProcessors(JCas jcas, ProcessorManager.Priority prio) {
try {
gp.process(jcas);
} catch (Exception exception) {
- exception.printStackTrace();
- Logger.printError(component, "Unable to process registered Processor " + gp.getClass().getName() + ", got: " + exception.toString());
+ LOG.error("Unable to process registered Processor " + gp.getClass().getName() + ", got: " + exception.toString(), exception);
System.exit(-1);
}
}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/ResolveAmbiguousValues.java b/src/de/unihd/dbs/uima/annotator/heideltime/ResolveAmbiguousValues.java
new file mode 100644
index 00000000..2646ee21
--- /dev/null
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/ResolveAmbiguousValues.java
@@ -0,0 +1,777 @@
+package de.unihd.dbs.uima.annotator.heideltime;
+
+import static de.unihd.dbs.uima.annotator.heideltime.utilities.ContextAnalyzer.*;
+import static de.unihd.dbs.uima.annotator.heideltime.utilities.DateCalculator.*;
+import static de.unihd.dbs.uima.annotator.heideltime.utilities.ParseInteger.*;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.jcas.JCas;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.unihd.dbs.uima.annotator.heideltime.resources.Language;
+import de.unihd.dbs.uima.annotator.heideltime.resources.NormalizationManager;
+import de.unihd.dbs.uima.annotator.heideltime.utilities.ContextAnalyzer.Tense;
+import de.unihd.dbs.uima.annotator.heideltime.utilities.Season;
+import de.unihd.dbs.uima.types.heideltime.Dct;
+import de.unihd.dbs.uima.types.heideltime.Timex3;
+
+class ResolveAmbiguousValues {
+ /** Class logger */
+ private static final Logger LOG = LoggerFactory.getLogger(ResolveAmbiguousValues.class);
+
+ private static final Pattern UNDEF_PATTERN = Pattern.compile("^UNDEF-(this|REFUNIT|REF)-(.*?)-(MINUS|PLUS)-([0-9]+)");
+
+ private static final Pattern UNDEF_UNIT = Pattern.compile("^UNDEF-(last|this|next)-(century|decade|year|quarter|month|week|day)");
+
+ private static final Pattern UNDEF_MONTH = Pattern.compile("^UNDEF-(last|this|next)-(january|february|march|april|may|june|july|august|september|october|november|december)(?:-([0-9][0-9]))?");
+
+ private static final Pattern UNDEF_SEASON = Pattern.compile("^UNDEF-(last|this|next)-(SP|SU|FA|WI)");
+
+ private static final Pattern UNDEF_WEEKDAY = Pattern.compile("^UNDEF-(last|this|next|day)-(monday|tuesday|wednesday|thursday|friday|saturday|sunday)");
+
+ private static final Pattern TWO_DIGITS = Pattern.compile("^\\d\\d$");
+
+ private static final Pattern THREE_DIGITS = Pattern.compile("^\\d\\d\\d$");
+
+ // Document creation time
+ public static class ParsedDct {
+ String dctValue = "";
+ int dctCentury = 0, dctYear = 0, dctDecade = 0, dctMonth = 0, dctDay = 0;
+ Season dctSeason = null;
+ String dctQuarter = "";
+ String dctHalf = "";
+ int dctWeekday = 0, dctWeek = 0;
+
+ private ParsedDct(String dctValue) {
+ // year, month, day as mentioned in the DCT
+ dctYear = parseInt(dctValue, 0, 4);
+ dctCentury = dctYear / 100;
+ dctDecade = parseInt(dctValue, 2, 3);
+ // Could be separated by slashes, or not.
+ if (Character.isDigit(dctValue.charAt(4))) {
+ dctMonth = parseInt(dctValue, 4, 6);
+ dctDay = parseInt(dctValue, 6, 8);
+ } else {
+ dctMonth = parseInt(dctValue, 5, 7);
+ dctDay = parseInt(dctValue, 8, 10);
+ }
+ dctQuarter = getQuarterOfMonth(dctMonth);
+ dctHalf = getHalfYearOfMonth(dctMonth);
+
+ // season, week, weekday, have to be calculated
+ dctSeason = getSeasonOfMonth(dctMonth);
+ dctWeekday = getWeekdayOfDate(dctYear, dctMonth, dctDay);
+ dctWeek = getWeekOfDate(dctYear, dctMonth, dctDay);
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("dctCentury: {}", dctCentury);
+ LOG.debug("dctYear: {}", dctYear);
+ LOG.debug("dctDecade: {}", dctDecade);
+ LOG.debug("dctMonth: {}", dctMonth);
+ LOG.debug("dctDay: {}", dctDay);
+ LOG.debug("dctQuarter: {}", dctQuarter);
+ LOG.debug("dctSeason: {}", dctSeason);
+ LOG.debug("dctWeekday: {}", dctWeekday);
+ LOG.debug("dctWeek: {}", dctWeek);
+ }
+ }
+
+ public static ParsedDct read(JCas jcas) {
+ String dctString = getDct(jcas);
+ return dctString != null ? new ParsedDct(dctString) : null;
+ }
+
+ public static String getDct(JCas jcas) {
+ AnnotationIndex dcts = jcas.getAnnotationIndex(Dct.type);
+ FSIterator dctIter = dcts.iterator();
+ return dctIter.hasNext() ? dctIter.next().getValue() : null;
+ }
+
+ private static final Pattern VALID_DCT = Pattern.compile("^\\d{4}[.-]?\\d{2}[.-]?\\d{2}");
+
+ /**
+ * Check whether or not a jcas object has a correct DCT value. If there is no DCT present, we canonically return true since fallback calculation takes care of that scenario.
+ *
+ * @param jcas
+ * @return Whether or not the given jcas contains a valid DCT
+ */
+ public static boolean isValidDCT(JCas jcas) {
+ String dctString = getDct(jcas);
+ // Something like 20041224 or 2004-12-24
+ return dctString == null || VALID_DCT.matcher(dctString).find();
+ }
+ }
+
+ NormalizationManager norm;
+
+ Language language;
+
+ private DocumentType documentType;
+
+ public void init(Language language, boolean find_temponyms, DocumentType typeToProcess) {
+ if (this.language != language) {
+ this.language = language;
+ norm = NormalizationManager.getInstance(language, find_temponyms);
+ }
+ this.documentType = typeToProcess;
+ }
+
+ public String specifyAmbiguousValuesString(String ambigString, Timex3 t_i, int i, List linearDates, JCas jcas) {
+ if (!ambigString.startsWith("UNDEF"))
+ return ambigString;
+ // If available, parse document creation time:
+ ParsedDct dct = ParsedDct.read(jcas); // was: (documentType != DocumentType.NARRATIVE) ? ParsedDct.read(jcas) : null;
+
+ // get the last tense (depending on the part of speech tags used in front or behind the expression)
+ Tense last_used_tense = getLastTense(t_i, jcas, language);
+
+ // DISAMBIGUATION PHASE:
+ if (ambigString.equals("UNDEF-REFDATE"))
+ return i > 0 ? linearDates.get(i - 1).getTimexValue() : "XXXX-XX-XX";
+ // Different patterns:
+ String repl = handleUndefYear(ambigString, linearDates, i, dct, last_used_tense);
+ repl = repl != null ? repl : handleUndefCentury(ambigString, linearDates, i, dct, last_used_tense);
+ repl = repl != null ? repl : handleUndefPlusMinus(ambigString, linearDates, i, dct);
+ repl = repl != null ? repl : handleUndefNextPrevThis(ambigString, linearDates, i, dct);
+ repl = repl != null ? repl : handleUndefMonth(ambigString, linearDates, i, dct);
+ repl = repl != null ? repl : handleUndefSeason(ambigString, linearDates, i, dct);
+ repl = repl != null ? repl : handleUndefWeekday(ambigString, linearDates, i, dct, last_used_tense);
+ if (repl == null) {
+ LOG.warn("Unhandled UNDEF value: {}", ambigString);
+ return ambigString;
+ }
+ return repl;
+ }
+
+ private String handleUndefPlusMinus(String ambigString, List linearDates, int i, ParsedDct dct) {
+ Matcher m = UNDEF_PATTERN.matcher(ambigString);
+ if (!m.find())
+ return null;
+ boolean fuzz = !ambigString.regionMatches(m.start(1), "REFUNIT", 0, 7);
+ String unit = m.group(2);
+ boolean positive = ambigString.regionMatches(m.start(3), "PLUS", 0, 4); // May only be PLUS or MINUS.
+ try {
+ int diff = parseInt(ambigString, m.start(4), m.end(4));
+ diff = positive ? diff : -diff; // Signed diff
+ String rep = adjustByUnit(linearDates, i, dct, unit, diff, fuzz);
+ if (rep == null)
+ return ambigString;
+ StringBuilder valueNew = join(rep, ambigString, m.end());
+ if ("year".equals(unit))
+ handleFiscalYear(valueNew);
+ return valueNew.toString();
+ } catch (NumberFormatException e) {
+ LOG.error("Invalid integer {} in {}", m.group(4), ambigString);
+ return positive ? "FUTURE_REF" : "PAST_REF";
+ }
+ }
+
+ private String handleUndefNextPrevThis(String ambigString, List linearDates, int i, ParsedDct dct) {
+ Matcher m = UNDEF_UNIT.matcher(ambigString);
+ if (!m.find())
+ return null;
+ String rel = m.group(1), unit = m.group(2);
+ int sdiff = 0;
+ switch (rel) {
+ case "this":
+ break;
+ case "last":
+ sdiff = -1;
+ break;
+ case "next":
+ sdiff = +1;
+ break;
+ default:
+ LOG.warn("Unknown relationship {} in {}", rel, ambigString);
+ return null;
+ }
+ String rep = adjustByUnit(linearDates, i, dct, unit, sdiff, true);
+ if (rep == null)
+ return ambigString;
+ StringBuilder valueNew = join(rep, ambigString, m.end());
+ if ("year".equals(unit))
+ handleFiscalYear(valueNew);
+ return valueNew.toString();
+ }
+
+ /**
+ * Adjust a date.
+ *
+ * @param linearDates
+ * Date mentions
+ * @param i
+ * Position
+ * @param dct
+ * Document creation time
+ * @param unit
+ * Unit
+ * @param sdiff
+ * Difference
+ * @param fuzz
+ * Fuzzing factor
+ * @return Adjusted date, or null.
+ */
+ private String adjustByUnit(List linearDates, int i, ParsedDct dct, String unit, int sdiff, boolean fuzz) {
+ // do the processing for SCIENTIFIC documents (TPZ identification could be improved)
+ if (documentType == DocumentType.SCIENTIFIC)
+ return formatScientific(unit, sdiff);
+ // TODO: BC dates are likely not handled correctly everywhere, although some cases may never occur, because we won't have day information BC.
+ switch (unit) {
+ case "century":
+ if (dct != null)
+ return norm.normNumber(dct.dctCentury + sdiff);
+ String lmCentury = getLastMentionedCentury(linearDates, i);
+ return lmCentury.isEmpty() ? "XX" : getXNextCentury(lmCentury, sdiff);
+ case "decade":
+ if (dct != null)
+ return (Integer.toString(dct.dctYear + sdiff * 10)).substring(0, 3);
+ String lmDecade = getLastMentionedDecade(linearDates, i);
+ return lmDecade.isEmpty() ? "XXXX" : getXNextDecade(lmDecade, sdiff);
+ case "year":
+ if (fuzz) { // Use year precision
+ if (dct != null)
+ return Integer.toString(dct.dctYear + sdiff);
+ String lmYear = getLastMentionedYear(linearDates, i);
+ return lmYear.isEmpty() ? "XXXX" : getXNextYear(lmYear, sdiff);
+ }
+ // Use day precision, if possible
+ // FIXME: Use dct?
+ String dateWithYear = getLastMentionedDateYear(linearDates, i);
+ if (dateWithYear.length() == 0)
+ return "XXXX";
+ // FIXME: clean up BC handling!
+ final int p = dateWithYear.startsWith("BC") ? 6 : 4;
+ String year = dateWithYear.substring(0, p);
+ String rest = dateWithYear.substring(p);
+ String yearNew = getXNextYear(year, sdiff);
+ return yearNew + rest;
+ case "quarter":
+ // TODO: assert not BC?
+ if (dct != null) {
+ // Use quarters, 0 to 3, for computation.
+ int quarters = (dct.dctYear << 2) + parseIntAt(dct.dctQuarter, 1) - 1 + sdiff;
+ return (quarters >> 2) + "-Q" + ((quarters & 0x3) + 1);
+ }
+ String lmQuarter = getLastMentionedQuarter(linearDates, i, language);
+ if (lmQuarter.isEmpty())
+ return "XXXX-XX";
+ // Use quarters, 0 to 3, for computation.
+ int quarters = (parseInt(lmQuarter, 0, 4) << 2) + parseIntAt(lmQuarter, 6) - 1 + sdiff;
+ return (quarters >> 2) + "-Q" + ((quarters & 0x3) + 1);
+ case "month":
+ // TODO: assert not BC?
+ if (dct != null)
+ return getXNextMonth(dct.dctYear + "-" + norm.normNumber(dct.dctMonth), sdiff);
+ String lmMonth = getLastMentionedMonth(linearDates, i);
+ return lmMonth.isEmpty() ? "XXXX-XX" : getXNextMonth(lmMonth, sdiff);
+ case "week":
+ // TODO: assert not BC?
+ if (fuzz /* && (sdiff > 1 || sdiff < -1) */) { // Use week precision
+ if (dct != null)
+ return getXNextWeek(dct.dctYear + "-W" + norm.normNumber(dct.dctWeek), sdiff);
+ String lmWeek = getLastMentionedWeek(linearDates, i);
+ return lmWeek.isEmpty() ? "XXXX-WXX" : getXNextWeek(lmWeek, sdiff);
+ }
+ // Use day precision, if possible
+ if (dct != null)
+ return getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, sdiff * 7);
+ String lmDayW = getLastMentionedDay(linearDates, i);
+ return lmDayW.isEmpty() ? "XXXX-WXX" : getXNextDay(lmDayW, sdiff * 7);
+ case "day":
+ if (dct != null)
+ return getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, sdiff);
+ String lmDay = getLastMentionedDay(linearDates, i);
+ return lmDay.isEmpty() ? "XXXX-XX-XX" : getXNextDay(lmDay, sdiff);
+ case "minute":
+ case "second":
+ case "hour":
+ // FIXME: support these, too?
+ return null;
+ case "week-WE":
+ // TODO: assert not BC?
+ if (fuzz /* && (sdiff > 1 || sdiff < -1) */) { // Use week precision
+ if (dct != null)
+ return getXNextWeek(dct.dctYear + "-W" + norm.normNumber(dct.dctWeek), sdiff);
+ String lmWeek = getLastMentionedWeek(linearDates, i);
+ return lmWeek.isEmpty() ? "XXXX-WXX-WE" : getXNextWeek(lmWeek, sdiff);
+ }
+ // Use day precision, if possible
+ if (dct != null)
+ return getXNextWeek(dct.dctYear, dct.dctMonth, dct.dctDay, sdiff) + "-WE";
+ String lmWeek = getLastMentionedWeek(linearDates, i);
+ return lmWeek.isEmpty() ? "XXXX-WXX-WE" : getXNextWeek(lmWeek, sdiff) + "-WE";
+ default:
+ LOG.warn("Unknown unit {}", unit);
+ return null;
+ }
+ }
+
+ private String formatScientific(String unit, int sdiff) {
+ final String fmt;
+ switch (unit) {
+ case "year":
+ fmt = "TPZ%c%04d";
+ break;
+ case "month":
+ fmt = "TPZ%c0000-%02d";
+ break;
+ case "week":
+ fmt = "TPZ%c0000-W%02d";
+ break;
+ case "day":
+ fmt = "TPZ%c0000-00-%02d";
+ break;
+ case "hour":
+ fmt = "TPZ%c0000-00-00T%02d";
+ break;
+ case "minute":
+ fmt = "TPZ%c0000-00-00T00:%02d";
+ break;
+ case "second":
+ fmt = "TPZ%c0000-00-00T00:00:%02d";
+ break;
+ default:
+ LOG.error("no scientific format for unit type {}", unit);
+ return null;
+ }
+ return String.format(Locale.ROOT, fmt, sdiff >= 0 ? '+' : '-', Math.abs(sdiff));
+ }
+
+ private String handleUndefYear(String ambigString, List linearDates, int i, ParsedDct dct, Tense last_used_tense) {
+ if (!ambigString.startsWith("UNDEF-year"))
+ return null;
+ last_used_tense = last_used_tense != null ? last_used_tense //
+ // In COLLOQUIAL, default to present/future, otherwise assume past (if undefined).
+ : (documentType == DocumentType.COLLOQUIAL ? Tense.PRESENTFUTURE : Tense.PAST);
+ String[] valueParts = ambigString.split("-");
+ String repl;
+ if (dct != null && valueParts.length > 2) {
+ int newYear = dct.dctYear;
+ String part2 = valueParts[2];
+ Season viThisSeason;
+ // get vi month
+ if (TWO_DIGITS.matcher(part2).matches()) {
+ // FIXME: check range of month and day?
+ int viThisMonth = parseInt(part2);
+ // Get day in vi
+ int viThisDay = (valueParts.length > 3 && TWO_DIGITS.matcher(valueParts[3]).matches()) //
+ ? parseInt(valueParts[3]) : -1;
+ // Tense is FUTURE
+ if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) {
+ // if dct-month is larger than vi-month, then add 1 to dct-year
+ if (dct.dctMonth > viThisMonth || //
+ (dct.dctMonth == viThisMonth && viThisDay > 0 && dct.dctDay > viThisDay))
+ ++newYear;
+ }
+ // Tense is PAST
+ else if (last_used_tense == Tense.PAST) {
+ // if dct-month is smaller than vi month, then subtract 1 from dct-year
+ if (dct.dctMonth < viThisMonth || //
+ (dct.dctMonth == viThisMonth && viThisDay > 0 && dct.dctDay < viThisDay))
+ --newYear;
+ }
+ }
+ // get vi season
+ else if ((viThisSeason = Season.of(part2)) != null) {
+ // Tense is FUTURE
+ if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) {
+ // if dct-month is larger than vi-month, then add 1 to dct-year
+ if (dct.dctSeason.ord() > viThisSeason.ord())
+ ++newYear;
+ }
+ // Tense is PAST
+ else if (last_used_tense == Tense.PAST) {
+ // if dct-month is smaller than vi month, then subtract 1 from dct-year
+ if (dct.dctSeason.ord() < viThisSeason.ord())
+ --newYear;
+ }
+ }
+ // get vi quarter
+ else if (part2.charAt(0) == 'Q' && part2.charAt(1) >= '1' && part2.charAt(1) <= '4') {
+ // Tense is FUTURE
+ if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) {
+ if (parseIntAt(dct.dctQuarter, 1) > parseIntAt(part2, 1))
+ ++newYear;
+ }
+ // Tense is PAST
+ if (last_used_tense == Tense.PAST) {
+ if (parseIntAt(dct.dctQuarter, 1) < parseIntAt(part2, 1))
+ --newYear;
+ }
+ }
+ // get vi half
+ else if (part2.charAt(0) == 'H' && (part2.equals("H1") || part2.equals("H2"))) {
+ // Tense is FUTURE
+ if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) {
+ if (parseIntAt(dct.dctHalf, 1) > parseIntAt(part2, 1))
+ ++newYear;
+ }
+ // Tense is PAST
+ if (last_used_tense == Tense.PAST) {
+ if (parseIntAt(dct.dctHalf, 1) < parseIntAt(part2, 1))
+ --newYear;
+ }
+ }
+ // get vi Week
+ else if (part2.charAt(0) == 'W') {
+ // Tense is FUTURE
+ if (last_used_tense == Tense.FUTURE) { // || last_used_tense == Tense.PRESENTFUTURE) {
+ if (dct.dctWeek > parseIntAt(part2, 1))
+ ++newYear;
+ }
+ // Tense is PAST
+ if (last_used_tense == Tense.PAST) {
+ if (dct.dctWeek < parseIntAt(part2, 1))
+ --newYear;
+ }
+ }
+ repl = Integer.toString(newYear);
+ } else {
+ repl = getLastMentionedYear(linearDates, i);
+ if (repl.isEmpty())
+ repl = "XXXX";
+ }
+ // REPLACE THE UNDEF-YEAR WITH THE NEWLY CALCULATED YEAR
+ return join(repl, ambigString, "UNDEF-year".length()).toString();
+ }
+
+ private String handleUndefCentury(String ambigString, List linearDates, int i, ParsedDct dct, Tense last_used_tense) {
+ if (!ambigString.startsWith("UNDEF-century"))
+ return null;
+ String repl = dct != null ? Integer.toString(dct.dctCentury) : "";
+
+ // FIXME: supposed to be NEWS and COLLOQUIAL DOCUMENTS
+ if (dct != null) {
+ int viThisDecade = parseInt(ambigString, 13, 14);
+ // Tense is FUTURE
+ if (last_used_tense == Tense.FUTURE || last_used_tense == Tense.PRESENTFUTURE)
+ repl = Integer.toString(dct.dctCentury + (viThisDecade < dct.dctDecade ? 1 : 0));
+ // Tense is PAST
+ else if (last_used_tense == Tense.PAST)
+ repl = Integer.toString(dct.dctCentury - (dct.dctDecade < viThisDecade ? 1 : 0));
+ }
+ // NARRATIVE DOCUMENTS
+ else {
+ repl = getLastMentionedCentury(linearDates, i);
+ if (!repl.startsWith("BC")) {
+ if (repl.matches("^\\d\\d.*") && parseInt(repl, 0, 2) < 10)
+ repl = "00";
+ } else {
+ repl = "00";
+ }
+ }
+ // LREC change: assume in narrative-style documents that
+ // if no other century was mentioned before, 1st century
+ // Otherwise, assume that sixties, twenties, and so on
+ // are 19XX if no century found (LREC change)
+ if (repl.isEmpty())
+ repl = (documentType == DocumentType.NARRATIVE ? "00" : "19");
+ StringBuilder valueNew = join(repl, ambigString, "UNDEF-century".length());
+ // always assume that sixties, twenties, and so on are 19XX -- if
+ // not narrative document (LREC change)
+ if (documentType != DocumentType.NARRATIVE && THREE_DIGITS.matcher(valueNew).matches())
+ valueNew.replace(0, 2, "19");
+ return valueNew.toString();
+ }
+
+ private String handleUndefMonth(String ambigString, List linearDates, int i, ParsedDct dct) {
+ Matcher m = UNDEF_MONTH.matcher(ambigString);
+ if (!m.find())
+ return null;
+ String ltn = m.group(1), newMonth = norm.getFromNormMonthName(m.group(2)), daystr = m.group(3);
+ String repl = "XXXX-XX";
+ if (ltn.equals("last")) {
+ if (dct != null) {
+ int newYear = dct.dctYear;
+ int newMonthInt = parseInt(newMonth);
+ int day = (daystr != null && daystr.length() > 0) ? parseInt(daystr) : 0;
+ // check day if dct-month and newMonth are equal
+ if (dct.dctMonth == newMonthInt) {
+ if (day != 0 && dct.dctDay <= day)
+ --newYear;
+ } else if (dct.dctMonth <= newMonthInt)
+ --newYear;
+ // TODO: 'format' year? could be < 1000.
+ repl = newYear + "-" + newMonth;
+ } else {
+ String lmMonth = getLastMentionedMonthDetails(linearDates, i);
+ if (!lmMonth.isEmpty()) {
+ int lmMonthInt = parseInt(lmMonth, 5, 7);
+ int lmDayInt = 0;
+ if (lmMonth.length() > 9 && TWO_DIGITS.matcher(lmMonth.subSequence(8, 10)).matches())
+ lmDayInt = parseInt(lmMonth, 8, 10);
+ int newYear = parseInt(lmMonth, 0, 4);
+ int newMonthInt = parseInt(newMonth);
+ int day = (daystr != null && daystr.length() > 0) ? parseInt(daystr) : 0;
+ if (lmMonthInt == newMonthInt) {
+ if (lmDayInt != 0 && day != 0 && lmDayInt <= day)
+ --newYear;
+ } else if (lmMonthInt <= newMonthInt)
+ --newYear;
+ // TODO: 'format' year? could be < 1000.
+ repl = newYear + "-" + newMonth;
+ }
+ }
+ } else if (ltn.equals("this")) {
+ if (dct != null) {
+ // TODO: 'format' year? could be < 1000.
+ repl = dct.dctYear + "-" + newMonth;
+ } else {
+ String lmMonth = getLastMentionedMonthDetails(linearDates, i);
+ if (!lmMonth.isEmpty())
+ repl = lmMonth.substring(0, 4) + "-" + newMonth;
+ }
+ } else if (ltn.equals("next")) {
+ if (dct != null) {
+ int newYear = dct.dctYear;
+ int newMonthInt = parseInt(newMonth);
+ int day = (daystr != null && daystr.length() > 0) ? parseInt(daystr) : 0;
+ // check day if dct-month and newMonth are equal
+ if (dct.dctMonth == newMonthInt) {
+ if (day != 0 && dct.dctDay >= day)
+ ++newYear;
+ } else if (dct.dctMonth >= newMonthInt)
+ ++newYear;
+ // TODO: 'format' year? could be < 1000.
+ repl = newYear + "-" + newMonth;
+ } else {
+ String lmMonth = getLastMentionedMonthDetails(linearDates, i);
+ if (!lmMonth.isEmpty()) {
+ int newYear = parseInt(lmMonth, 0, 4), lmMonthInt = parseInt(lmMonth, 5, 7);
+ int newMonthInt = parseInt(newMonth);
+ if (lmMonthInt >= newMonthInt)
+ ++newYear;
+ // TODO: 'format' year? could be < 1000.
+ repl = newYear + "-" + newMonth;
+ }
+ }
+ } else {
+ LOG.warn("Unhandled undef-month: {}", ltn);
+ }
+ return join(repl, ambigString, m.end()).toString();
+ }
+
+ private String handleUndefSeason(String ambigString, List linearDates, int i, ParsedDct dct) {
+ Matcher m = UNDEF_SEASON.matcher(ambigString);
+ if (!m.find())
+ return null;
+ String ltn = m.group(1);
+ Season newSeason = Season.of(ambigString, m.start(2));
+ String repl = "XXXX-XX";
+ if (ltn.equals("last")) {
+ if (dct != null) {
+ int newYear = dct.dctYear - (newSeason.ord() < dct.dctSeason.ord() //
+ || (dct.dctSeason == Season.WINTER && dct.dctMonth < 12) //
+ ? 1 : 0);
+ // TODO: 'format' year? could be < 1000.
+ repl = newYear + "-" + newSeason;
+ } else { // NARRATIVE DOCUMENT
+ String lmSeason = getLastMentionedSeason(linearDates, i, language);
+ if (lmSeason != null && !lmSeason.isEmpty()) {
+ Season se = Season.of(lmSeason, 5);
+ int newYear = parseInt(lmSeason, 0, 4) - (newSeason.ord() < se.ord() ? 1 : 0);
+ // TODO: 'format' year? could be < 1000.
+ repl = newYear + "-" + newSeason;
+ }
+ }
+ } else if (ltn.equals("this")) {
+ // TODO use tense of sentence?
+ if (dct != null) {
+ // TODO: 'format' year? could be < 1000.
+ repl = dct.dctYear + "-" + newSeason;
+ } else {
+ String lmSeason = getLastMentionedSeason(linearDates, i, language);
+ if (lmSeason != null && !lmSeason.isEmpty())
+ repl = lmSeason.substring(0, 4) + "-" + newSeason;
+ }
+ } else if (ltn.equals("next")) {
+ if (dct != null) {
+ int newYear = dct.dctYear + (newSeason.ord() <= dct.dctSeason.ord() ? 1 : 0);
+ // TODO: 'format' year? could be < 1000.
+ repl = newYear + "-" + newSeason;
+ } else { // NARRATIVE DOCUMENT
+ String lmSeason = getLastMentionedSeason(linearDates, i, language);
+ if (lmSeason != null && !lmSeason.isEmpty()) {
+ Season se = Season.of(lmSeason, 5);
+ int newYear = parseInt(lmSeason, 0, 4) + (newSeason.ord() <= se.ord() ? 1 : 0);
+ // TODO: 'format' year? could be < 1000.
+ repl = newYear + "-" + newSeason;
+ }
+ }
+ } else {
+ LOG.warn("Unhandled undef-season: {}", ltn);
+ }
+ return join(repl, ambigString, m.end()).toString();
+ }
+
+ private String handleUndefWeekday(String ambigString, List linearDates, int i, ParsedDct dct, Tense last_used_tense) {
+ Matcher m = UNDEF_WEEKDAY.matcher(ambigString);
+ if (!m.find())
+ return null;
+ // TODO (before refactoring:) the calculation is strange, but works
+ // But we improved this during refactoring, is it less strange now?
+ // TODO tense should be included?!
+ String ltnd = m.group(1), newWeekday = m.group(2);
+ int newWeekdayInt = parseInt(norm.getFromNormDayInWeek(newWeekday));
+ String repl = "XXXX-XX-XX";
+ if (ltnd.equals("last")) {
+ if (dct != null) {
+ int diff = -(dct.dctWeekday - newWeekdayInt);
+ diff = (diff >= 0) ? diff - 7 : diff;
+ repl = getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, diff);
+ } else {
+ String lmDay = getLastMentionedDay(linearDates, i);
+ if (!lmDay.isEmpty()) {
+ int lmWeekdayInt = getWeekdayOfDate(lmDay);
+ int diff = -(lmWeekdayInt - newWeekdayInt);
+ diff = (diff >= 0) ? diff - 7 : diff;
+ repl = getXNextDay(lmDay, diff);
+ }
+ }
+ } else if (ltnd.equals("this")) {
+ if (dct != null) {
+ // TODO tense should be included?!
+ int diff = -(dct.dctWeekday - newWeekdayInt);
+ diff = (diff > 0) ? diff - 7 : diff;
+ repl = getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, diff);
+ } else {
+ // TODO tense should be included?!
+ String lmDay = getLastMentionedDay(linearDates, i);
+ if (!lmDay.isEmpty()) {
+ int lmWeekdayInt = getWeekdayOfDate(lmDay);
+ int diff = -(lmWeekdayInt - newWeekdayInt);
+ diff = (diff > 0) ? diff - 7 : diff;
+ repl = getXNextDay(lmDay, diff);
+ }
+ }
+ } else if (ltnd.equals("next")) {
+ if (dct != null) {
+ int diff = newWeekdayInt - dct.dctWeekday;
+ diff = (diff <= 0) ? diff + 7 : diff;
+ repl = getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, diff);
+ } else {
+ String lmDay = getLastMentionedDay(linearDates, i);
+ if (!lmDay.isEmpty()) {
+ int lmWeekdayInt = getWeekdayOfDate(lmDay);
+ int diff = newWeekdayInt - lmWeekdayInt;
+ diff = (diff <= 0) ? diff + 7 : diff;
+ repl = getXNextDay(lmDay, diff);
+ }
+ }
+ } else if (ltnd.equals("day")) {
+ if (dct != null) {
+ // TODO tense should be included?!
+ int diff = -(dct.dctWeekday - newWeekdayInt);
+ diff = (diff > 0) ? diff - 7 : diff;
+ // Tense is FUTURE
+ if ((last_used_tense == Tense.FUTURE) && diff != 0)
+ diff += 7;
+ // Tense is PAST
+ // if ((last_used_tense == Tense.PAST)) ?
+ repl = getXNextDay(dct.dctYear, dct.dctMonth, dct.dctDay, diff);
+ } else {
+ // TODO tense should be included?!
+ String lmDay = getLastMentionedDay(linearDates, i);
+ if (!lmDay.isEmpty()) {
+ int lmWeekdayInt = getWeekdayOfDate(lmDay);
+ int diff = -(lmWeekdayInt - newWeekdayInt);
+ diff = (diff > 0) ? diff - 7 : diff;
+ repl = getXNextDay(lmDay, diff);
+ }
+ }
+ } else {
+ LOG.warn("Unhandled undef-weekday: {}", ltnd);
+ }
+ return join(repl, ambigString, m.end()).toString();
+ }
+
+ /**
+ * Join pre-string + post-string beginning at offsetPost, effectively replacing the first offsetPost characters with the pre string.
+ *
+ * @param pre
+ * Prefix
+ * @param post
+ * Postfix
+ * @param offsetPost
+ * Number of chars in postfix to skip.
+ * @return String builder, for further modification
+ */
+ private static StringBuilder join(String pre, String post, final int offsetPost) {
+ StringBuilder valueNew = new StringBuilder(pre.length() + post.length() - offsetPost);
+ valueNew.append(pre);
+ valueNew.append(post, offsetPost, post.length());
+ return valueNew;
+ }
+
+ /**
+ * Under-specified values are disambiguated here. Only Timexes of types "date" and "time" can be under-specified.
+ *
+ * @param jcas
+ */
+ public void specifyAmbiguousValues(JCas jcas) {
+ // build up a list with all found TIMEX expressions
+ List linearDates = new ArrayList();
+ AnnotationIndex timexes = jcas.getAnnotationIndex(Timex3.type);
+
+ // Create List of all Timexes of types "date" and "time"
+ for (Timex3 timex : timexes) {
+ if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME"))
+ linearDates.add(timex);
+
+ if (timex.getTimexType().equals("DURATION") && timex.getEmptyValue().length() > 0)
+ linearDates.add(timex);
+ }
+
+ //////////////////////////////////////////////
+ // go through list of Date and Time timexes //
+ //////////////////////////////////////////////
+ for (int i = 0; i < linearDates.size(); i++) {
+ Timex3 t_i = linearDates.get(i);
+ String value_i = t_i.getTimexValue();
+
+ String valueNew = value_i;
+ // handle the value attribute only if we have a TIME or DATE
+ if (t_i.getTimexType().equals("TIME") || t_i.getTimexType().equals("DATE"))
+ valueNew = specifyAmbiguousValuesString(value_i, t_i, i, linearDates, jcas);
+
+ // handle the emptyValue attribute for any type
+ if (t_i.getEmptyValue() != null && t_i.getEmptyValue().length() > 0)
+ t_i.setEmptyValue(specifyAmbiguousValuesString(t_i.getEmptyValue(), t_i, i, linearDates, jcas));
+
+ t_i.removeFromIndexes();
+ if (LOG.isDebugEnabled() && !valueNew.equals(t_i.getTimexValue()))
+ LOG.debug("{} {} DISAMBIGUATION: foundBy: {} text: {} value: {} NEW value: {} ", //
+ t_i.getSentId(), t_i.getTimexId(), t_i.getFoundByRule(), t_i.getCoveredText(), t_i.getTimexValue(), valueNew);
+
+ t_i.setTimexValue(valueNew);
+ t_i.addToIndexes();
+ linearDates.set(i, t_i);
+ }
+ }
+
+ /**
+ * Convert a -FY postfix to a FY prefix.
+ *
+ * @param buf
+ * Buffer to operate on
+ */
+ private static void handleFiscalYear(StringBuilder buf) {
+ if (buf.length() < 4)
+ return;
+ // Unfortunately, StringBuilder does not have an "endsWith".
+ int p = buf.length() - 3;
+ if (buf.charAt(p) == '-' && buf.charAt(++p) == 'F' && buf.charAt(++p) == 'Y') {
+ // Keep at most the year:
+ buf.setLength(Math.min(p, 4));
+ buf.insert(0, "FY");
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/DecadeProcessor.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/DecadeProcessor.java
index 5f513624..76476c53 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/DecadeProcessor.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/DecadeProcessor.java
@@ -6,36 +6,32 @@
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
-import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import de.unihd.dbs.uima.types.heideltime.Timex3;
public class DecadeProcessor extends GenericProcessor {
-
/**
* Constructor just calls the parent constructor here.
*/
public DecadeProcessor() {
super();
}
-
/**
- * not needed here
+ * not needed here
*/
public void initialize(UimaContext aContext) {
return;
}
-
+
/**
* all the functionality was put into evaluateCalculationFunctions().
*/
public void process(JCas jcas) {
evaluateFunctions(jcas);
}
-
-
+
/**
* This function replaces function calls from the resource files with their TIMEX value.
*
@@ -43,44 +39,33 @@ public void process(JCas jcas) {
* @param jcas
*/
public void evaluateFunctions(JCas jcas) {
-
// build up a list with all found TIMEX expressions
List linearDates = new ArrayList();
- FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator();
+ Iterable timexes = jcas.getAnnotationIndex(Timex3.type);
// Create List of all Timexes of types "date" and "time"
- while (iterTimex.hasNext()) {
- Timex3 timex = (Timex3) iterTimex.next();
- if (timex.getTimexType().equals("DATE")) {
+ for (Timex3 timex : timexes)
+ if (timex.getTimexType().equals("DATE"))
linearDates.add(timex);
- }
- }
-
-
+
//////////////////////////////////////////////
// go through list of Date and Time timexes //
//////////////////////////////////////////////
- //compile regex pattern for validating commands/arguments
- Pattern cmd_p = Pattern.compile("(\\w\\w\\w\\w)-(\\w\\w)-(\\w\\w)\\s+decadeCalc\\((\\d+)\\)");
+ // compile regex pattern for validating commands/arguments
+ Matcher cmd_p = Pattern.compile("(\\w\\w\\w\\w)-(\\w\\w)-(\\w\\w)\\s+decadeCalc\\((\\d+)\\)").matcher("");
- Matcher cmd_m;
- String year;
- String valueNew;
- String argument;
-
for (int i = 0; i < linearDates.size(); i++) {
- Timex3 t_i = (Timex3) linearDates.get(i);
+ Timex3 t_i = linearDates.get(i);
String value_i = t_i.getTimexValue();
- cmd_m = cmd_p.matcher(value_i);
- valueNew = value_i;
-
- if(cmd_m.matches()) {
- year = cmd_m.group(1);
- argument = cmd_m.group(4);
-
+ Matcher cmd_m = cmd_p.reset(value_i);
+ String valueNew = value_i;
+
+ if (cmd_m.matches()) {
+ String year = cmd_m.group(1);
+ String argument = cmd_m.group(4);
valueNew = year.substring(0, Math.min(2, year.length())) + argument.substring(0, 1);
}
-
+
t_i.removeFromIndexes();
t_i.setTimexValue(valueNew);
t_i.addToIndexes();
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/HolidayProcessor.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/HolidayProcessor.java
index 56bc67c5..12bb8dbf 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/HolidayProcessor.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/HolidayProcessor.java
@@ -1,27 +1,30 @@
package de.unihd.dbs.uima.annotator.heideltime.processors;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.time.temporal.WeekFields;
import java.util.ArrayList;
-import java.util.Calendar;
-import java.util.Date;
-import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
-import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import de.unihd.dbs.uima.types.heideltime.Timex3;
+
/**
- * Addition to HeidelTime to recognize several (mostly, but not
- * entirely christian) holidays.
+ * Addition to HeidelTime to recognize several (mostly, but not entirely Christian) holidays.
+ *
* @author Hans-Peter Pfeiffer
*
*/
public class HolidayProcessor extends GenericProcessor {
+ /** Class logger */
+ private static final Logger LOG = LoggerFactory.getLogger(HolidayProcessor.class);
/**
* Constructor just calls the parent constructor here.
@@ -29,23 +32,25 @@ public class HolidayProcessor extends GenericProcessor {
public HolidayProcessor() {
super();
}
-
/**
- * not needed here
+ * not needed here
*/
public void initialize(UimaContext aContext) {
return;
}
-
+
/**
* all the functionality was put into evaluateCalculationFunctions().
*/
public void process(JCas jcas) {
evaluateCalculationFunctions(jcas);
}
-
-
+
+ Pattern cmd_p = Pattern.compile("((\\w\\w\\w\\w)-(\\w\\w)-(\\w\\w))\\s+funcDateCalc\\((\\w+)\\((.+)\\)\\)");
+ Pattern year_p = Pattern.compile("(\\d\\d\\d\\d)");
+ Pattern date_p = Pattern.compile("(\\d\\d\\d\\d)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])");
+
/**
* This function replaces function calls from the resource files with their TIMEX value.
*
@@ -53,138 +58,87 @@ public void process(JCas jcas) {
* @param jcas
*/
public void evaluateCalculationFunctions(JCas jcas) {
+ // compile regex pattern for validating commands/arguments
+ Matcher cmd_m = cmd_p.matcher("");
- // build up a list with all found TIMEX expressions
- List linearDates = new ArrayList();
- FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator();
+ AnnotationIndex timexes = jcas.getAnnotationIndex(Timex3.type);
+ // Avoid concurrent modification exceptions
+ ArrayList copy = new ArrayList(timexes.size());
+ for (Timex3 timex : timexes)
+ if (timex.getTimexType().equals("DATE") || timex.getTimexType().equals("TIME"))
+ copy.add(timex);
- // Create List of all Timexes of types "date" and "time"
- while (iterTimex.hasNext()) {
- Timex3 timex = (Timex3) iterTimex.next();
- if ((timex.getTimexType().equals("DATE")) || (timex.getTimexType().equals("TIME"))) {
- linearDates.add(timex);
- }
- }
-
-
- //////////////////////////////////////////////
- // go through list of Date and Time timexes //
- //////////////////////////////////////////////
- //compile regex pattern for validating commands/arguments
- Pattern cmd_p = Pattern.compile("((\\w\\w\\w\\w)-(\\w\\w)-(\\w\\w))\\s+funcDateCalc\\((\\w+)\\((.+)\\)\\)");
- Pattern year_p = Pattern.compile("(\\d\\d\\d\\d)");
- Pattern date_p = Pattern.compile("(\\d\\d\\d\\d)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])");
- Matcher cmd_m;
- Matcher year_m;
- Matcher date_m;
- String date;
- String year;
- String month;
- String day;
- String function;
- String args[];
- String valueNew;
-
- for (int i = 0; i < linearDates.size(); i++) {
- Timex3 t_i = (Timex3) linearDates.get(i);
- String value_i = t_i.getTimexValue();
- cmd_m = cmd_p.matcher(value_i);
- valueNew = value_i;
-
- if(cmd_m.matches()) {
- date = cmd_m.group(1);
- year = cmd_m.group(2);
- month = cmd_m.group(3);
- day = cmd_m.group(4);
- function = cmd_m.group(5);
- args = cmd_m.group(6).split("\\s*,\\s*");
-
- //replace keywords in function with actual values
- for(int j=0; j0) || (!count_itself && number <= 0)) {
- if(day<=weekday) {
- add = weekday - day;
- }
- else{
- add = weekday - day + 7;
- }
- }
- else{
- if(day 0) || (!count_itself && number <= 0)) {
+ add = (day <= weekday) ? weekday - day : weekday - day + 7;
+ } else {
+ add = (day < weekday) ? weekday - day : weekday - day + 7;
+ }
+ add += ((number - 1) * 7);
+ return d.plusDays(add).format(FORMATTER);
+ } catch (DateTimeParseException e) {
+ LOG.error(e.getMessage(), e);
+ return "";
}
}
-
-
+
/**
* Get the date of a the first, second, third etc. weekday in a month
*
@@ -402,25 +290,19 @@ public String getWeekdayOfMonth(int number, int weekday, int month, int year) {
return getWeekdayRelativeTo(String.format("%04d-%02d-01", year, month), weekday, number, true);
}
- private int getJulianDifference(int year){
- //TODO: this is not entirely correct!
- int century = year/100 + 1;
- if(century<18){
- return 10;
- }
- if(century==18){
- return 11;
- }
- if(century==19){
- return 12;
- }
- if(century==20||century == 21){
- return 13;
- }
- if(century==22){
- return 14;
- }
- return 15;
- }
-
+ private int getJulianDifference(int year) {
+ // FIXME: this is not entirely correct!
+ int century = year / 100 + 1;
+ if (century < 18)
+ return 10;
+ if (century == 18)
+ return 11;
+ if (century == 19)
+ return 12;
+ if (century == 20 || century == 21)
+ return 13;
+ if (century == 22)
+ return 14;
+ return 15;
+ }
}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorInitializationException.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorInitializationException.java
index 46cd8134..d368c161 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorInitializationException.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorInitializationException.java
@@ -3,10 +3,8 @@
import de.unihd.dbs.uima.annotator.heideltime.HeidelTimeException;
public class ProcessorInitializationException extends HeidelTimeException {
-
/**
- *
+ * Serialization version
*/
private static final long serialVersionUID = -4036889037291484936L;
-
}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorProcessingException.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorProcessingException.java
index 315f9b2d..17dbc6c2 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorProcessingException.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/ProcessorProcessingException.java
@@ -3,10 +3,8 @@
import de.unihd.dbs.uima.annotator.heideltime.HeidelTimeException;
public class ProcessorProcessingException extends HeidelTimeException {
-
/**
- *
+ * Serialization version
*/
private static final long serialVersionUID = 6123306006146166368L;
-
}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/processors/TemponymPostprocessing.java b/src/de/unihd/dbs/uima/annotator/heideltime/processors/TemponymPostprocessing.java
index 67d0c3c1..fcb0fa3f 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/processors/TemponymPostprocessing.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/processors/TemponymPostprocessing.java
@@ -1,81 +1,77 @@
package de.unihd.dbs.uima.annotator.heideltime.processors;
import java.util.HashSet;
-import java.util.regex.MatchResult;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox;
import de.unihd.dbs.uima.types.heideltime.Timex3;
import de.unihd.dbs.uima.types.heideltime.Timex3Interval;
/**
+ * This class removes TIMEX3 annotations for temponyms and adds TIMEX3INTERVAL annotations containing (earliest|latest)(Begin|End) information.
*
- * This class removes TIMEX3 annotations for temponyms and adds
- * TIMEX3INTERVAL annotations containing (earliest|latest)(Begin|End) information.
* @author jannik stroetgen
- *
*/
public class TemponymPostprocessing {
-
- public static void handleIntervals(JCas jcas){
-
+ private static final Logger LOG = LoggerFactory.getLogger(TemponymPostprocessing.class);
+
+ private static final Pattern p = Pattern.compile("\\[(.*?), (.*?), (.*?), (.*?)\\]");
+
+ public static void handleIntervals(JCas jcas) {
HashSet timexes = new HashSet<>();
-
+
+ Matcher mr = p.matcher("");
// iterate over all TEMPONYMS
- FSIterator iterTimex = jcas.getAnnotationIndex(Timex3.type).iterator();
- while (iterTimex.hasNext()) {
- Timex3 t = (Timex3) iterTimex.next();
- if (t.getTimexType().equals("TEMPONYM")) {
-
- // create a timex3interval for each temponym
- Timex3Interval ti = new Timex3Interval(jcas);
+ AnnotationIndex timex3s = jcas.getAnnotationIndex(Timex3.type);
+ for (Timex3 t : timex3s) {
+ if (!t.getTimexType().equals("TEMPONYM"))
+ continue;
+ LOG.debug("TEMPONYM: {}", t.getCoveredText());
+ // create a timex3interval for each temponym
+ Timex3Interval ti = new Timex3Interval(jcas);
- System.err.println("TEMPONYM: " + t.getCoveredText());
-
- ti.setBegin(t.getBegin());
- ti.setEnd(t.getEnd());
- ti.setTimexType(t.getTimexType());
- ti.setAllTokIds(t.getAllTokIds());
- ti.setTimexFreq(t.getTimexFreq());
- ti.setTimexMod(t.getTimexMod());
- ti.setTimexQuant(t.getTimexQuant());
- // set a new id
- String id = t.getTimexId();
- int newId = Integer.parseInt(id.replace("t", ""));
- newId += 100000;
- ti.setTimexId("t" + newId);
+ ti.setBegin(t.getBegin());
+ ti.setEnd(t.getEnd());
+ ti.setTimexType(t.getTimexType());
+ ti.setAllTokIds(t.getAllTokIds());
+ ti.setTimexFreq(t.getTimexFreq());
+ ti.setTimexMod(t.getTimexMod());
+ ti.setTimexQuant(t.getTimexQuant());
+ // set a new id
+ String id = t.getTimexId();
+ int newId = Integer.parseInt(id.replace("t", ""));
+ newId += 100000;
+ ti.setTimexId("t" + newId);
- // get the (earliest|last)(begin|end) information
- Pattern p = Pattern.compile("\\[(.*?), (.*?), (.*?), (.*?)\\]");
- for (MatchResult mr : Toolbox.findMatches(p,t.getTimexValue())) {
- ti.setTimexValueEB(mr.group(1));
- ti.setTimexValueLB(mr.group(2));
- ti.setTimexValueEE(mr.group(3));
- ti.setTimexValueLE(mr.group(4));
- }
- //System.err.println("temponym: " + t.getTimexValue());
- if ((ti.getTimexValueEB() == ti.getTimexValueLB()) &&
- (ti.getTimexValueLB() == ti.getTimexValueEE()) &&
- (ti.getTimexValueEE() == ti.getTimexValueLE())) {
- ti.setTimexValue(ti.getTimexValueEB());
- t.setTimexValue(ti.getTimexValueEB());
- }
- else { // what's the best single value for an interval!?
- t.setEmptyValue(t.getTimexValue());
- ti.setTimexValue(ti.getTimexValueLE());
- t.setTimexValue(ti.getTimexValueLE());
- }
- ti.setFoundByRule(t.getFoundByRule());
- ti.addToIndexes();
- timexes.add(t);
+ // get the (earliest|last)(begin|end) information
+ for (mr.reset(t.getTimexValue()); mr.find();) {
+ ti.setTimexValueEB(mr.group(1));
+ ti.setTimexValueLB(mr.group(2));
+ ti.setTimexValueEE(mr.group(3));
+ ti.setTimexValueLE(mr.group(4));
}
+ // System.err.println("temponym: " + t.getTimexValue());
+ if (ti.getTimexValueEB().equals(ti.getTimexValueLB()) && //
+ ti.getTimexValueLB().equals(ti.getTimexValueEE()) && //
+ ti.getTimexValueEE().equals(ti.getTimexValueLE())) {
+ ti.setTimexValue(ti.getTimexValueEB());
+ t.setTimexValue(ti.getTimexValueEB());
+ } else { // what's the best single value for an interval!?
+ t.setEmptyValue(t.getTimexValue());
+ ti.setTimexValue(ti.getTimexValueLE());
+ t.setTimexValue(ti.getTimexValueLE());
+ }
+ ti.setFoundByRule(t.getFoundByRule());
+ ti.addToIndexes();
+ timexes.add(t);
}
// shall the standard timexes really be removed?
- for (Timex3 t : timexes){
+ for (Timex3 t : timexes)
t.removeFromIndexes();
- }
}
}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/GenericResourceManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/GenericResourceManager.java
index 9eb233b7..29f7a0f1 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/GenericResourceManager.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/GenericResourceManager.java
@@ -1,32 +1,77 @@
package de.unihd.dbs.uima.annotator.heideltime.resources;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
/**
*
- * Abstract class for all Resource Managers to inherit from. Contains basic
- * functionality such as file system access and some private members.
+ * Abstract class for all Resource Managers to inherit from. Contains basic functionality such as file system access and some private members.
*
*/
public abstract class GenericResourceManager {
// language for the utilized resources
- protected String LANGUAGE;
+ protected final String LANGUAGE;
// kind of resource -- e.g. repattern, normalization, rules
protected String resourceType;
- // local package for logging output
- protected Class> component;
-
+
/**
* Instantiates the Resource Manager with a resource type
- * @param resourceType kind of resource to represent
+ *
+ * @param resourceType
+ * kind of resource to represent
*/
protected GenericResourceManager(String resourceType, String language) {
this.resourceType = resourceType;
this.LANGUAGE = language;
- this.component = this.getClass();
}
-
- protected String replaceSpaces(String inText) {
- String outText = inText.replaceAll(" ", "[\\\\u2000-\\\\u200A \\\\u202F\\\\u205F\\\\u3000\\\\u00A0\\\\u1680\\\\u180E]+");
-
- return outText;
+
+ private static final Pattern WHITESPACE = Pattern.compile("(?: |\\\\[sS])");
+
+ public static String replaceSpaces(String inText) {
+ Matcher m = WHITESPACE.matcher(inText);
+ if (!m.find())
+ return inText;
+ final int len = inText.length();
+ StringBuilder buf = new StringBuilder();
+ int lastpos = 0;
+ do {
+ int start = m.start(), end = m.end();
+ final char lastchar = inText.charAt(end - 1);
+ assert (lastchar == ' ' || lastchar == 's' || lastchar == 'S');
+ boolean negative = lastchar == 'S';
+ boolean chargroup = false;
+ String extra = "+"; // By default, insert a plus.
+ if (end < len) {
+ char next = inText.charAt(end);
+ if (next == '?' || next == '*' || next == '+' || next == '{')
+ extra = null; // Preserve
+ if (next == ']' && start > 0 && inText.charAt(start - 1) == '[') {
+
+ }
+ }
+ for (int s = end; s < len; s++) {
+ char next = inText.charAt(s);
+ if (next == '[' && inText.charAt(s - 1) != '\\')
+ break; // Supposedly not in a character group.
+ if (next == ']' && inText.charAt(s - 1) != '\\') {
+ chargroup = true;
+ break;
+ }
+ }
+ buf.append(inText, lastpos, start);
+ if (chargroup) {
+ // buf.append(negative ? "\\P{javaWhitespace}" : "\\p{javaWhitespace}");
+ buf.append(negative ? "\\S" : "\\s");
+ } else {
+ // buf.append(negative ? "[\\P{javaWhitespace}]" : "[\\p{javaWhitespace}]");
+ buf.append(negative ? "\\S" : "\\s");
+ if (extra != null)
+ buf.append(extra);
+ }
+ lastpos = end;
+ } while (m.find());
+ if (lastpos < len)
+ buf.append(inText, lastpos, len);
+ return buf.toString();
}
}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/Language.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/Language.java
index 79f9c6c2..11637dfb 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/Language.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/Language.java
@@ -1,6 +1,6 @@
package de.unihd.dbs.uima.annotator.heideltime.resources;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
+import org.slf4j.LoggerFactory;
/**
* Hardcoded Language information for use with HeidelTime/Standalone. Contains
@@ -61,7 +61,7 @@ public enum Language {
*/
public final static Language getLanguageFromString(String name) {
if(name == null) {
- Logger.printError("Language parameter was specified as NULL.");
+ LoggerFactory.getLogger(Language.class).error("Language parameter was specified as NULL.");
throw new NullPointerException();
}
@@ -99,6 +99,10 @@ public final String getResourceFolder() {
return this.resourceFolder;
}
+ public final boolean useLowercase() {
+ return this != ARABIC;
+ }
+
@Override
public String toString() {
return getName();
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/NormalizationManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/NormalizationManager.java
index 2a6a96c8..ccdf7fd8 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/NormalizationManager.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/NormalizationManager.java
@@ -5,22 +5,24 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
-import java.util.regex.MatchResult;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
- *
* This class fills the role of a manager of all the Normalization resources.
* It reads the data from a file system and fills up a bunch of HashMaps
* with their information.
+ *
* @author jannik stroetgen
- *
*/
public class NormalizationManager extends GenericResourceManager {
+ /** Class logger */
+ private static final Logger LOG = LoggerFactory.getLogger(NormalizationManager.class);
+
protected static HashMap instances = new HashMap();
- // PATTERNS TO READ RESOURCES "RULES" AND "NORMALIZATION"
- private Pattern paReadNormalizations = Pattern.compile("\"(.*?)\",\"(.*?)\"");
// STORE PATTERNS AND NORMALIZATIONS
private HashMap> hmAllNormalization;
@@ -32,6 +34,8 @@ public class NormalizationManager extends GenericResourceManager {
private HashMap normMonthInSeason;
private HashMap normMonthInQuarter;
+ private String[] normNumbers;
+
/**
* Constructor calls the parent constructor that sets language/resource parameters,
* initializes basic and collects resource normalization patterns.
@@ -60,9 +64,8 @@ private NormalizationManager(String language, Boolean load_temponym_resources) {
ResourceScanner rs = ResourceScanner.getInstance();
ResourceMap hmResourcesNormalization = rs.getNormalizations(language);
- for (String which : hmResourcesNormalization.keySet()) {
+ for (String which : hmResourcesNormalization.keySet())
hmAllNormalization.put(which, new RegexHashMap());
- }
readNormalizationResources(hmResourcesNormalization, load_temponym_resources);
}
@@ -72,12 +75,12 @@ private NormalizationManager(String language, Boolean load_temponym_resources) {
* @return singleton instance of NormalizationManager
*/
public static NormalizationManager getInstance(Language language, Boolean load_temponym_resources) {
- if(!instances.containsKey(language.getName())) {
- NormalizationManager nm = new NormalizationManager(language.getResourceFolder(), load_temponym_resources);
+ NormalizationManager nm = instances.get(language.getName());
+ if(nm == null) {
+ nm = new NormalizationManager(language.getResourceFolder(), load_temponym_resources);
instances.put(language.getName(), nm);
}
-
- return instances.get(language.getName());
+ return nm;
}
/**
@@ -87,61 +90,40 @@ public static NormalizationManager getInstance(Language language, Boolean load_t
* @param load_temponym_resources whether temponym resources are loaded
*/
public void readNormalizationResources(ResourceMap hmResourcesNormalization, Boolean load_temponym_resources) {
+ // PATTERNS TO READ RESOURCES "RULES" AND "NORMALIZATION"
+ Matcher maReadNormalizations = Pattern.compile("\"(.*?)\",\"(.*?)\"").matcher("");
+ for (String resource : hmResourcesNormalization.keySet()) {
+ // read normalization resources with "Temponym" only if temponym tagging is selected
+ if (resource.contains("Temponym") &&
+ !(load_temponym_resources && resource.contains("Temponym"))) {
+ LOG.trace("No Temponym tagging selected. Skipping normalization resource: {}", resource);
+ continue;
+ }
+ LOG.debug("Adding normalization resource: {}", resource);
+ // create a buffered reader for every normalization resource file
+ try(InputStream is = hmResourcesNormalization.getInputStream(resource); //
+ InputStreamReader isr = new InputStreamReader(is, "UTF-8");//
+ BufferedReader br = new BufferedReader(isr)) {
+ for (String line; (line=br.readLine()) != null; ) {
+ if (line.startsWith("//") || line.length() == 0) continue; // ignore comments and empty lines
- InputStream is = null;
- InputStreamReader isr = null;
- BufferedReader br = null;
- try {
- for (String resource : hmResourcesNormalization.keySet()) {
- // read normalization resources with "Temponym" only if temponym tagging is selected
- if ( (!(resource.contains("Temponym"))) ||
- ((load_temponym_resources) && (resource.contains("Temponym")))){
-
- Logger.printDetail(component, "Adding normalization resource: "+resource);
- // create a buffered reader for every normalization resource file
- is = hmResourcesNormalization.getInputStream(resource);
- isr = new InputStreamReader(is, "UTF-8");
- br = new BufferedReader(isr);
- for ( String line; (line=br.readLine()) != null; ) {
- if (line.startsWith("//")) continue; // ignore comments
-
- // check each line for the normalization format (defined in paReadNormalizations)
- boolean correctLine = false;
- for (MatchResult r : Toolbox.findMatches(paReadNormalizations, line)) {
- correctLine = true;
- String resource_word = replaceSpaces(r.group(1));
- String normalized_word = r.group(2);
- for (String which : hmAllNormalization.keySet()) {
- if (resource.equals(which)) {
- hmAllNormalization.get(which).put(resource_word,normalized_word);
- }
- }
- if ((correctLine == false) && (!(line.matches("")))) {
- Logger.printError("["+component+"] Cannot read one of the lines of normalization resource "+resource);
- Logger.printError("["+component+"] Line: "+line);
- }
+ // check each line for the normalization format (defined in paReadNormalizations)
+ maReadNormalizations.reset(line);
+ if (!maReadNormalizations.find()) {
+ LOG.error("Cannot read one of the lines of normalization resource {}\nLine: {}", resource, line);
+ continue;
+ }
+ String resource_word = maReadNormalizations.group(1);
+ String normalized_word = maReadNormalizations.group(2);
+ for (String which : hmAllNormalization.keySet()) {
+ if (resource.equals(which)) {
+ hmAllNormalization.get(which).put(resource_word, normalized_word);
}
}
}
- else {
- Logger.printDetail(component, "No Temponym Tagging selected. Skipping normalization resource: "+resource);
- }
- }
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(br != null) {
- br.close();
- }
- if(isr != null) {
- isr.close();
- }
- if(is != null) {
- is.close();
- }
- } catch(Exception e) {
- e.printStackTrace();
+ } catch (IOException e) {
+ LOG.error(e.getMessage(), e);
+ System.exit(1);
}
}
}
@@ -151,7 +133,6 @@ public void readNormalizationResources(ResourceMap hmResourcesNormalization, Boo
* sets a couple of rudimentary normalization parameters
*/
private void readGlobalNormalizationInformation() {
-
// MONTH IN QUARTER
normMonthInQuarter.put("01","1");
normMonthInQuarter.put("02","1");
@@ -167,7 +148,7 @@ private void readGlobalNormalizationInformation() {
normMonthInQuarter.put("12","4");
// MONTH IN SEASON
- normMonthInSeason.put("", "");
+ normMonthInSeason.put("", ""); // FIXME: document why an empty-string key mapped to an empty value is required here
normMonthInSeason.put("01","WI");
normMonthInSeason.put("02","WI");
normMonthInSeason.put("03","SP");
@@ -182,34 +163,34 @@ private void readGlobalNormalizationInformation() {
normMonthInSeason.put("12","WI");
// DAY IN WEEK
- normDayInWeek.put("sunday","1");
- normDayInWeek.put("monday","2");
- normDayInWeek.put("tuesday","3");
- normDayInWeek.put("wednesday","4");
- normDayInWeek.put("thursday","5");
- normDayInWeek.put("friday","6");
- normDayInWeek.put("saturday","7");
- normDayInWeek.put("Sunday","1");
- normDayInWeek.put("Monday","2");
- normDayInWeek.put("Tuesday","3");
- normDayInWeek.put("Wednesday","4");
- normDayInWeek.put("Thursday","5");
- normDayInWeek.put("Friday","6");
- normDayInWeek.put("Saturday","7");
-// normDayInWeek.put("sunday","7");
-// normDayInWeek.put("monday","1");
-// normDayInWeek.put("tuesday","2");
-// normDayInWeek.put("wednesday","3");
-// normDayInWeek.put("thursday","4");
-// normDayInWeek.put("friday","5");
-// normDayInWeek.put("saturday","6");
-// normDayInWeek.put("Sunday","7");
-// normDayInWeek.put("Monday","1");
-// normDayInWeek.put("Tuesday","2");
-// normDayInWeek.put("Wednesday","3");
-// normDayInWeek.put("Thursday","4");
-// normDayInWeek.put("Friday","5");
-// normDayInWeek.put("Saturday","6");
+// normDayInWeek.put("sunday","1");
+// normDayInWeek.put("monday","2");
+// normDayInWeek.put("tuesday","3");
+// normDayInWeek.put("wednesday","4");
+// normDayInWeek.put("thursday","5");
+// normDayInWeek.put("friday","6");
+// normDayInWeek.put("saturday","7");
+// normDayInWeek.put("Sunday","1");
+// normDayInWeek.put("Monday","2");
+// normDayInWeek.put("Tuesday","3");
+// normDayInWeek.put("Wednesday","4");
+// normDayInWeek.put("Thursday","5");
+// normDayInWeek.put("Friday","6");
+// normDayInWeek.put("Saturday","7");
+ normDayInWeek.put("sunday","7");
+ normDayInWeek.put("monday","1");
+ normDayInWeek.put("tuesday","2");
+ normDayInWeek.put("wednesday","3");
+ normDayInWeek.put("thursday","4");
+ normDayInWeek.put("friday","5");
+ normDayInWeek.put("saturday","6");
+ normDayInWeek.put("Sunday","7");
+ normDayInWeek.put("Monday","1");
+ normDayInWeek.put("Tuesday","2");
+ normDayInWeek.put("Wednesday","3");
+ normDayInWeek.put("Thursday","4");
+ normDayInWeek.put("Friday","5");
+ normDayInWeek.put("Saturday","6");
// NORM MINUTE
@@ -285,6 +266,12 @@ private void readGlobalNormalizationInformation() {
normNumber.put("59","59");
normNumber.put("60","60");
+ normNumbers = new String[61];
+ for (int i = 0; i < 10; i++)
+ normNumbers[i] = "0"+i;
+ for (int i = 10; i <= 60; i++)
+ normNumbers[i] = Integer.toString(i);
+
// NORM MONTH
normMonthName.put("january","01");
normMonthName.put("february","02");
@@ -310,6 +297,10 @@ public final String getFromNormNumber(String key) {
return normNumber.get(key);
}
+ public final String normNumber(int key) {
+ return key >= 0 && key <= 60 ? normNumbers[key] : null;
+ }
+
public final String getFromNormDayInWeek(String key) {
return normDayInWeek.get(key);
}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RePatternManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RePatternManager.java
index c04388d0..a5ea5914 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RePatternManager.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RePatternManager.java
@@ -4,30 +4,42 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
-import java.util.Comparator;
import java.util.HashMap;
-import java.util.LinkedList;
+import java.util.List;
import java.util.TreeMap;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.unihd.dbs.uima.annotator.heideltime.utilities.RegexpOptimizer;
+import de.unihd.dbs.uima.annotator.heideltime.utilities.RegexpOptimizer.OptimizerException;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
/**
*
- * This class fills the role of a manager of all the RePattern resources.
- * It reads the data from a file system and fills up a bunch of HashMaps
- * with their information.
+ * This class fills the role of a manager of all the RePattern resources. It reads the data from a file system and fills up a bunch of HashMaps with their information.
+ *
* @author jannik stroetgen
*
*/
public class RePatternManager extends GenericResourceManager {
+ /** Class logger */
+ private static final Logger LOG = LoggerFactory.getLogger(RePatternManager.class);
+
protected static HashMap instances = new HashMap();
-
+
// STORE PATTERNS AND NORMALIZATIONS
private TreeMap hmAllRePattern;
+ private HashMap compiled;
+
/**
- * Constructor calls the parent constructor that sets language/resource
- * parameters and collects resource repatterns.
+ * Constructor calls the parent constructor that sets language/resource parameters and collects resource repatterns.
+ *
* @param language
* @param load_temponym_resources
*/
@@ -36,6 +48,7 @@ private RePatternManager(String language, Boolean load_temponym_resources) {
super("repattern", language);
// initialize the member map of all repatterns
hmAllRePattern = new TreeMap();
+ compiled = new HashMap();
//////////////////////////////////////////////////////
// READ PATTERN RESOURCES FROM FILES AND STORE THEM //
@@ -50,145 +63,134 @@ private RePatternManager(String language, Boolean load_temponym_resources) {
/**
* singleton producer.
+ *
* @return singleton instance of RePatternManager
*/
public static RePatternManager getInstance(Language language, Boolean load_temponym_resources) {
- if(!instances.containsKey(language.getName())) {
+ if (!instances.containsKey(language.getName())) {
RePatternManager nm = new RePatternManager(language.getResourceFolder(), load_temponym_resources);
instances.put(language.getName(), nm);
}
-
+
return instances.get(language.getName());
}
-
-
+
/**
* READ THE REPATTERN FROM THE FILES. The files have to be defined in the HashMap hmResourcesRePattern.
- * @param hmResourcesRePattern RePattern resources to be interpreted
- * @param load_temponym_resources whether temponym resources are to be read
+ *
+ * @param hmResourcesRePattern
+ * RePattern resources to be interpreted
+ * @param load_temponym_resources
+ * whether temponym resources are to be read
*/
- private void readRePatternResources(ResourceMap hmResourcesRePattern, Boolean load_temponym_resources) {
-
+ private void readRePatternResources(ResourceMap hmResourcesRePattern, boolean load_temponym_resources) {
//////////////////////////////////////
// READ REGULAR EXPRESSION PATTERNS //
//////////////////////////////////////
- InputStream is = null;
- InputStreamReader isr = null;
- BufferedReader br = null;
- try {
- for (String resource : hmResourcesRePattern.keySet()) {
- // read pattern resources with "Temponym" only if temponym tagging is selected
- if ( (!(resource.contains("Temponym"))) ||
- ((load_temponym_resources) && (resource.contains("Temponym")))){
- Logger.printDetail(component, "Adding pattern resource: "+resource);
- // create a buffered reader for every repattern resource file
- is = hmResourcesRePattern.getInputStream(resource);
- isr = new InputStreamReader(is, "UTF-8");
- br = new BufferedReader(isr);
- LinkedList patterns = new LinkedList();
- for (String line; (line = br.readLine()) != null; ) {
- // disregard comments
- if (!line.startsWith("//") && !line.equals("")) {
- patterns.add(replaceSpaces(line));
- }
- }
-
-
-
- // sort the repatterns by length in ascending order
- Collections.sort(patterns, new Comparator() {
- @Override
- public int compare(String o1, String o2) {
- String o1effective = o1.replaceAll("\\[[^\\]]*\\]", "X")
- .replaceAll("\\?", "")
- .replaceAll("\\\\.(?:\\{([^\\}])+\\})?", "X$1");
- String o2effective = o2.replaceAll("\\[[^\\]]*\\]", "X")
- .replaceAll("\\?", "")
- .replaceAll("\\\\.(?:\\{([^\\}])+\\})?", "X$1");
-
- if(o1effective.length() < o2effective.length())
- return 1;
- else if(o1effective.length() > o2effective.length())
- return -1;
- else
- return 0;
- }
- });
-
- StringBuilder sb = new StringBuilder();
- String devPattern = "";
- for(String pat : patterns) {
- sb.append("|");
- sb.append(pat);
- }
- devPattern = sb.toString();
- hmAllRePattern.put(resource, devPattern);
- }
- else {
- Logger.printDetail(component, "No Temponym Tagging selected. Skipping pattern resource: "+resource);
- }
- }
- ////////////////////////////
- // FINALIZE THE REPATTERN //
- ////////////////////////////
- for (String which : hmAllRePattern.keySet()) {
- if ( (!(which.contains("Temponym"))) ||
- ((load_temponym_resources) && (which.contains("Temponym")))){
- finalizeRePattern(which, hmAllRePattern.get(which));
- }
+ for (String resource : hmResourcesRePattern.keySet()) {
+ // read pattern resources with "Temponym" only if temponym tagging is selected
+ if (!load_temponym_resources && resource.contains("Temponym")) {
+ LOG.trace("No Temponym tagging selected. Skipping pattern resource: {}", resource);
+ continue;
}
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if(br != null) {
- br.close();
- }
- if(isr != null) {
- isr.close();
- }
- if(is != null) {
- is.close();
- }
- } catch(Exception e) {
- e.printStackTrace();
+ LOG.debug("Adding pattern resource: {}", resource);
+ // create a buffered reader for every repattern resource file
+ try (InputStream is = hmResourcesRePattern.getInputStream(resource); //
+ InputStreamReader isr = new InputStreamReader(is, "UTF-8"); //
+ BufferedReader br = new BufferedReader(isr)) {
+ List patterns = new ArrayList();
+ for (String line; (line = br.readLine()) != null;)
+ // disregard comments
+ if (!line.startsWith("//") && !line.equals(""))
+ patterns.add(line);
+ patterns = optimizePatterns(resource, patterns);
+ hmAllRePattern.put(resource, String.join("|", patterns));
+ } catch (IOException e) {
+ LOG.error(e.getMessage(), e);
}
}
}
-
+
/**
- * Pattern containing regular expression is finalized, i.e., created correctly and added to hmAllRePattern.
- * @param name key name
- * @param rePattern repattern value
+ * Optimize a set of patterns into a more efficient combined regexp, to work around Java's inefficient handling of large regex alternations.
+ *
+ * @author Erich Schubert
+ * @param inpatterns
+ * Input patterns
+ * @return Optimized regular expression set
*/
- private void finalizeRePattern(String name, String rePattern) {
- // create correct regular expression
- rePattern = rePattern.replaceFirst("\\|", "");
- /* this was added to reduce the danger of getting unusable groups from user-made repattern
- * files with group-producing parentheses (i.e. "(foo|bar)" while matching against the documents. */
- rePattern = rePattern.replaceAll("\\(([^\\?])", "(?:$1");
- rePattern = "(" + rePattern + ")";
- rePattern = rePattern.replaceAll("\\\\", "\\\\\\\\");
- // add rePattern to hmAllRePattern
- hmAllRePattern.put(name, rePattern);
+ public static List optimizePatterns(CharSequence name, List inpatterns) {
+ // Since we already have some rules written as res,
+ // We try to expand some basic constructs first.
+ try {
+ ArrayList expanded = new ArrayList<>();
+ for (String s : inpatterns) {
+ try {
+ RegexpOptimizer.expandPatterns(s, x -> expanded.add(x.toString()));
+ } catch (OptimizerException e) {
+ // More specific message than below.
+ LOG.warn("Pattern '{}' for '{}' contains a too complex regexp construct, cannot optimize: {}", s, name, e.getMessage());
+ return inpatterns;
+ }
+ }
+ if (expanded.isEmpty()) {
+ LOG.info("Regexp pattern {} is empty.", name);
+ return Collections.emptyList();
+ }
+ String pattern = RegexpOptimizer.combinePatterns(expanded);
+ LOG.trace("Combined {} into: {}", name, pattern);
+ return Arrays.asList(pattern);
+ } catch (OptimizerException e) {
+ LOG.warn("Pattern '{}' contains a too complex regexp construct, cannot optimize: {}", name, e.getMessage());
+ return inpatterns;
+ }
}
-
+
/**
* proxy method to access the hmAllRePattern member
- * @param key key to check for
+ *
+ * @param key
+ * key to check for
* @return whether the map contains the key
*/
- public Boolean containsKey(String key) {
+ public boolean containsKey(String key) {
return hmAllRePattern.containsKey(key);
}
/**
* proxy method to access the hmAllRePattern member
- * @param key Key to retrieve data from
+ *
+ * @param key
+ * Key to retrieve data from
* @return String from the map
*/
public String get(String key) {
return hmAllRePattern.get(key);
}
+ /**
+ * proxy method to access the compiled hmAllRePattern member
+ *
+ * @param key
+ * Key to retrieve data from
+ * @return String from the map
+ */
+ public Pattern getCompiled(String key) {
+ Pattern p = compiled.get(key);
+ if (p != null)
+ return p;
+ String rePattern = hmAllRePattern.get(key);
+ try {
+ Pattern c = Pattern.compile(rePattern);
+ int groupcount = c.matcher("").groupCount();
+ if (groupcount != 0)
+ LOG.error("rePattern {} contains unexpected groups: {}\nPattern: {}", key, groupcount - 1, rePattern);
+ compiled.put(key, c);
+ return c;
+ } catch (PatternSyntaxException e) {
+ LOG.error("Failed to compile RePattern {}:\n{}", key, rePattern);
+ throw e;
+ }
+ }
+
}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RegexHashMap.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RegexHashMap.java
index 8ffe5987..1a11e795 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RegexHashMap.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RegexHashMap.java
@@ -3,7 +3,6 @@
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
@@ -12,12 +11,10 @@
* Implements a HashMap extended with regular expression keys and caching functionality.
*
* @author Julian Zell
- *
*/
public class RegexHashMap implements Map {
-
- private HashMap container = new HashMap();
- private HashMap cache = new HashMap();
+ private HashMap container = new HashMap<>();
+ private HashMap cache = new HashMap<>();
/**
* clears both the container and the cache hashmaps
@@ -32,6 +29,8 @@ public void clear() {
* container's keys as regexes and checks whether they match the specific key.
*/
public boolean containsKey(Object key) {
+ if (!(key instanceof String))
+ return false;
// the key is a direct hit from our cache
if(cache.containsKey(key))
return true;
@@ -39,12 +38,11 @@ public boolean containsKey(Object key) {
if(container.containsKey(key))
return true;
+ String str = (String) key;
// check if the requested key is a matching string of a regex key from our container
- Iterator regexKeys = container.keySet().iterator();
- while(regexKeys.hasNext()) {
- if(Pattern.matches(regexKeys.next(), (String) key))
+ for(String regexKey : container.keySet())
+ if(Pattern.matches(regexKey, str))
return true;
- }
// if the three previous tests yield no result, the key does not exist
return false;
@@ -70,7 +68,7 @@ public boolean containsValue(Object value) {
*/
public Set> entrySet() {
// prepare the container
- HashSet> set = new HashSet>();
+ HashSet> set = new HashSet<>();
// add the set from our container
set.addAll(container.entrySet());
// add the set from our cache
@@ -88,26 +86,24 @@ public Set> entrySet() {
*/
public T get(Object key) {
// output for requested key null is the value null; normal Map behavior
- if(key == null) return null;
-
+ if(!(key instanceof String)) return null;
+
T result = null;
- if((result = cache.get(key)) != null) {
- // if the requested key maps to a value in the cache
+ // if the requested key maps to a value in the cache
+ if((result = cache.get(key)) != null)
return result;
- } else if((result = container.get(key)) != null) {
- // if the requested key maps to a value in the container
+
+ // if the requested key maps to a value in the container
+ if((result = container.get(key)) != null)
return result;
- } else {
- // check if the requested key is a matching string of a regex key from our container
- Iterator> regexKeys = container.entrySet().iterator();
- while(regexKeys.hasNext()) {
- // prepare current entry
- Entry entry = regexKeys.next();
- // check if the key is a regex matching the input key
- if(Pattern.matches(entry.getKey(), (String) key)) {
- putCache((String) key, entry.getValue());
- return entry.getValue();
- }
+
+ // check if the requested key is a matching string of a regex key from our container
+ String str = (String) key;
+ for (Entry entry : container.entrySet()) {
+ // check if the key is a regex matching the input key
+ if(Pattern.matches(entry.getKey(), str)) {
+ putCache(str, entry.getValue());
+ return entry.getValue();
}
}
@@ -127,7 +123,7 @@ public boolean isEmpty() {
*/
public Set keySet() {
// prepare container
- HashSet set = new HashSet();
+ HashSet set = new HashSet<>();
// add container keys
set.addAll(container.keySet());
// add cache keys
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceMap.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceMap.java
index 660f8ef8..19e86108 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceMap.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceMap.java
@@ -11,7 +11,7 @@
import java.util.Set;
import java.util.TreeSet;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
+import org.slf4j.LoggerFactory;
public class ResourceMap implements Map {
HashMap outerFiles = new HashMap();
@@ -52,7 +52,7 @@ public InputStream getInputStream(String key) {
try {
is = new FileInputStream(outerFiles.get(key));
} catch(FileNotFoundException e) {
- Logger.printError("File " + key + " disppeared while loading resources.");
+ LoggerFactory.getLogger(ResourceMap.class).error("File " + key + " disappeared while loading resources.");
}
} else if(innerFiles.containsKey(key)) {
is = this.getClass().getClassLoader().getResourceAsStream(innerFiles.get(key));
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceScanner.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceScanner.java
index a71bcb3b..5b31e1d1 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceScanner.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/ResourceScanner.java
@@ -15,9 +15,13 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class ResourceScanner {
+ /** Class logger */
+ private static final Logger LOG = LoggerFactory.getLogger(ResourceScanner.class);
+
private static ResourceScanner INSTANCE = null;
/**
@@ -100,12 +104,10 @@ private ResourceScanner() {
public static void main(String[] args) {
- @SuppressWarnings("unused")
- ResourceScanner rs = null;
try {
- rs = new ResourceScanner();
+ new ResourceScanner();
} catch (Exception e) {
- e.printStackTrace();
+ LOG.error(e.getMessage(), e);
}
}
@@ -125,7 +127,7 @@ private void scanValidInsideResourcesFolder(HashMap jarContent
Pattern rulePattern = Pattern.compile(language + "/rules/resources_rules_(.+)\\.txt$");
if (entry.getValue().isDirectory()) {
- Logger.printDetail(ResourceScanner.class, "Testing " + entry.getKey());
+ LOG.trace("Testing {}", entry.getKey());
/*
* our conditions for something being a resources folder: the resource
* folder must contain at least the following folders:
@@ -136,9 +138,9 @@ private void scanValidInsideResourcesFolder(HashMap jarContent
* |- rules
*/
- Boolean repatternExists = false;
- Boolean normalizationExists = false;
- Boolean ruleExists = false;
+ boolean repatternExists = false;
+ boolean normalizationExists = false;
+ boolean ruleExists = false;
for(String entryName : jarContents.keySet()) {
if(!repatternExists && repatternPattern.matcher(entryName).matches()) {
@@ -153,11 +155,11 @@ private void scanValidInsideResourcesFolder(HashMap jarContent
}
if(!repatternExists || !normalizationExists || !ruleExists) {
- Logger.printDetail(ResourceScanner.class, "We need at least one readable resource file of each type to run.");
+ LOG.debug("We need at least one readable resource file of each type to run (in {})", entry.getKey());
continue;
}
- Logger.printDetail(ResourceScanner.class, "Valid resource folder.");
+ LOG.trace("Valid resource folder: {}", entry.getKey());
// at this point, the folder is obviously a language resource folder => collect streams
this.repatterns.put(language, new ResourceMap());
@@ -191,10 +193,10 @@ private void scanValidOutsideResourcesFolder(File resourcePath) {
for (File supposedLanguagePath : pathContents) {
String language = supposedLanguagePath.getName();
if (supposedLanguagePath.isDirectory()) {
- Logger.printDetail(ResourceScanner.class, "Testing " + supposedLanguagePath.getAbsolutePath());
+ LOG.trace("Testing {}", supposedLanguagePath);
if (!supposedLanguagePath.exists()) {
- Logger.printDetail(ResourceScanner.class, "This path doesn't exist.");
+ LOG.debug("This path doesn't exist.");
continue;
}
@@ -215,8 +217,7 @@ private void scanValidOutsideResourcesFolder(File resourcePath) {
if (!repatternFolder.exists() || !repatternFolder.canRead() || !repatternFolder.isDirectory()
|| !normalizationFolder.exists() || !normalizationFolder.canRead() || !normalizationFolder.isDirectory()
|| !ruleFolder.exists() || !ruleFolder.canRead() || !ruleFolder.isDirectory()) {
- Logger.printDetail(ResourceScanner.class, "We need at least the folders repattern, normalization and rules in this folder.");
-
+ LOG.debug("We need at least the folders repattern, normalization and rules in folder '{}'.", supposedLanguagePath);
continue;
}
@@ -244,11 +245,11 @@ public boolean accept(File arg0, String arg1) {
|| !repatternFiles[0].exists() || !repatternFiles[0].canRead() || !repatternFiles[0].isFile()
|| !normalizationFiles[0].exists() || !normalizationFiles[0].canRead() || !normalizationFiles[0].isFile()
|| !ruleFiles[0].exists() || !ruleFiles[0].canRead() || !ruleFiles[0].isFile()) {
- Logger.printDetail(ResourceScanner.class, "We need at least one readable resource file of each type to run.");
+ LOG.debug("We need at least one readable resource file of each type to run in '{}'", supposedLanguagePath);
continue;
}
- Logger.printDetail(ResourceScanner.class, "Valid resource folder.");
+ LOG.trace("Valid resource folder: {}", supposedLanguagePath);
// at this point, the folder is obviously a language resource folder => collect streams
this.repatterns.put(language, new ResourceMap());
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/Rule.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/Rule.java
new file mode 100644
index 00000000..616a3a23
--- /dev/null
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/Rule.java
@@ -0,0 +1,98 @@
+package de.unihd.dbs.uima.annotator.heideltime.resources;
+
+import java.util.regex.Pattern;
+
+/**
+ * Class representing a single rule.
+ *
+ * @author Erich Schubert
+ */
+public class Rule implements Comparable {
+ /**
+ * Constructor with mandatory parameters.
+ *
+ * @param name Name
+ * @param pattern Pattern
+ * @param normalization Normalization
+ */
+ public Rule(String name, Pattern pattern, String normalization) {
+ this.name = name;
+ this.pattern = pattern;
+ this.normalization = normalization;
+ }
+
+ /** Rule name */
+ String name;
+
+ /** Extraction pattern */
+ Pattern pattern;
+
+ /** Normalization */
+ String normalization;
+
+ /** Offset pattern*/
+ String offset;
+
+ /** Quant */
+ String quant;
+
+ /** Freq */
+ String freq;
+
+ /** Mod */
+ String mod;
+
+ /** Position constraint */
+ String posConstratint;
+
+ /** Empty value */
+ String emptyValue;
+
+ /** Fast check */
+ Pattern fastCheck;
+
+ public String getName() {
+ return name;
+ }
+
+ public Pattern getPattern() {
+ return pattern;
+ }
+
+ public String getNormalization() {
+ return normalization;
+ }
+
+ public String getOffset() {
+ return offset;
+ }
+
+ public String getQuant() {
+ return quant;
+ }
+
+ public String getFreq() {
+ return freq;
+ }
+
+ public String getMod() {
+ return mod;
+ }
+
+ public String getPosConstratint() {
+ return posConstratint;
+ }
+
+ public String getEmptyValue() {
+ return emptyValue;
+ }
+
+ public Pattern getFastCheck() {
+ return fastCheck;
+ }
+
+ @Override
+ public int compareTo(Rule other) {
+ return name.compareTo(other.name);
+ }
+}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleExpansion.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleExpansion.java
new file mode 100644
index 00000000..12988302
--- /dev/null
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleExpansion.java
@@ -0,0 +1,255 @@
+package de.unihd.dbs.uima.annotator.heideltime.resources;
+
+import java.util.regex.MatchResult;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import de.unihd.dbs.uima.annotator.heideltime.utilities.ChineseNumbers;
+
+/**
+ * HeidelTime rule expansion logic.
+ *
+ * There is some copy and paste involved in the {@code expandX} functions, but this allows the hotspot VM to optimize them independently.
+ *
+ * This should probably be integrated into the {@link Rule} class, and only some expansions are necessary.
+ *
+ * @author Erich Schubert
+ */
+public class RuleExpansion {
+	/** Class logger */
+	private static final Logger LOG = LoggerFactory.getLogger(RuleExpansion.class);
+
+	/** Pattern: normalization function applied to a match group */
+	static Pattern paNorm = Pattern.compile("%([A-Za-z0-9]+?)\\(group\\(([0-9]+)\\)\\)");
+	/** Pattern: plain group reference */
+	static Pattern paGroup = Pattern.compile("group\\(([0-9]+)\\)");
+	/** Pattern: %SUBSTRING% function */
+	static Pattern paSubstring = Pattern.compile("%SUBSTRING%\\((.*?),([0-9]+),([0-9]+)\\)");
+	/** Pattern: %LOWERCASE% function */
+	static Pattern paLowercase = Pattern.compile("%LOWERCASE%\\((.*?)\\)");
+	/** Pattern: %UPPERCASE% function */
+	static Pattern paUppercase = Pattern.compile("%UPPERCASE%\\((.*?)\\)");
+	/** Pattern: %SUM% function */
+	static Pattern paSum = Pattern.compile("%SUM%\\((.*?),(.*?)\\)");
+	/** Pattern: normalization function applied to a literal value */
+	static Pattern paNormNoGroup = Pattern.compile("%([A-Za-z0-9]+?)\\((.*?)\\)");
+	/** Pattern: %CHINESENUMBERS% function */
+	static Pattern paChineseNorm = Pattern.compile("%CHINESENUMBERS%\\((.*?)\\)");
+	/** Whitespace (including newlines) to collapse before map lookups */
+	static Pattern WHITESPACE_NORM = Pattern.compile("[\n\\s]+");
+
+	/**
+	 * Expand all normalization functions and group references in a pattern.
+	 *
+	 * @param rule Rule name (for error reporting only)
+	 * @param pattern Normalization pattern to expand
+	 * @param m Match result providing the group values
+	 * @param norm Normalization resources
+	 * @param language Language (controls case folding)
+	 * @return Expanded normalization string
+	 */
+	public static String applyRuleFunctions(String rule, String pattern, MatchResult m, NormalizationManager norm, Language language) {
+		StringBuilder tonormalize = new StringBuilder(pattern);
+		// Reuse a single Matcher for all expansions to avoid allocations.
+		Matcher mr = paNorm.matcher(tonormalize);
+		// Repeat until no "%" function or "group(" reference remains.
+		while (tonormalize.indexOf("%") >= 0 || tonormalize.indexOf("group") >= 0) {
+			String before = tonormalize.toString();
+			// replace normalization functions with group argument
+			expandNormalizationGroup(tonormalize, mr, norm, m, rule);
+			// replace other groups
+			expandGroups(tonormalize, mr, m, rule);
+			// apply the substring function
+			expandSubstringFunction(tonormalize, mr, m, rule);
+			if (language.useLowercase()) {
+				expandLowerCaseFunction(tonormalize, mr);
+				expandUpperCaseFunction(tonormalize, mr);
+			}
+			// replace sum, concatenation
+			expandSumFunction(tonormalize, mr, m, rule);
+			// replace normalization function without group
+			expandNormalizationFull(tonormalize, mr, norm, rule);
+			// replace Chinese with Arabic numerals
+			replaceChineseNumerals(tonormalize, mr);
+			if (before.contentEquals(tonormalize)) {
+				// No expansion matched: break rather than loop forever on a literal '%' or 'group'.
+				LOG.warn("Unexpandable remainder in normalization of rule {}: '{}'", rule, before);
+				break;
+			}
+		}
+		return tonormalize.toString();
+	}
+
+	/** Expand {@code %FUNC(group(n))}: look up match group n in normalization map FUNC. */
+	private static void expandNormalizationGroup(StringBuilder tonormalize, Matcher mr, NormalizationManager norm, MatchResult m, String rule) {
+		mr.usePattern(paNorm).reset(tonormalize);
+		for (int pos = 0; mr.find(pos);) {
+			String normfunc = mr.group(1);
+			int start = mr.start(), end = mr.end();
+			int groupid = Integer.parseInt(mr.group(2));
+			if (LOG.isTraceEnabled()) {
+				LOG.trace("rule:" + rule);
+				LOG.trace("tonormalize:" + tonormalize.toString());
+				LOG.trace("x.group():" + mr.group());
+				LOG.trace("x.group(1):" + normfunc);
+				LOG.trace("x.group(2):" + mr.group(2));
+				LOG.trace("m.group():" + m.group());
+				LOG.trace("m.group(" + groupid + "):" + m.group(groupid));
+				LOG.trace("hmR...:" + norm.getFromHmAllNormalization(normfunc).get(m.group(groupid)));
+			}
+			if (groupid > m.groupCount()) {
+				LOG.error("Invalid group reference '{}' in normalization pattern of rule: {}", groupid, rule);
+				tonormalize.delete(start, end);
+				continue;
+			}
+			String value = m.group(groupid);
+			if (value == null) {
+				// Not unusual: the group may be optional and absent from this match.
+				LOG.debug("Empty part to normalize in {}, rule {}, '{}'", normfunc, rule, m.group());
+				tonormalize.delete(start, end);
+				continue;
+			}
+			value = WHITESPACE_NORM.matcher(value).replaceAll(" ");
+			RegexHashMap<String> normmap = norm.getFromHmAllNormalization(normfunc);
+			String rep = normmap != null ? normmap.get(value) : null;
+			if (rep == null) {
+				if (normfunc.contains("Temponym")) {
+					LOG.debug("Temponym '{}' normalization problem. Value: {} in " + //
+							"rule: {} tonormalize: {}", normfunc, value, rule, tonormalize);
+					tonormalize.delete(start, end);
+					continue;
+				}
+				LOG.warn("'{}' normalization problem. Value: {} in " + //
+						"rule: {} tonormalize: {}", normfunc, value, rule, tonormalize);
+				tonormalize.delete(start, end);
+				continue;
+			}
+			tonormalize.replace(start, end, rep);
+			pos = start + rep.length();
+		}
+	}
+
+	/** Expand plain {@code group(n)} references with the matched text. */
+	private static void expandGroups(StringBuilder tonormalize, Matcher mr, MatchResult m, String rule) {
+		mr.usePattern(paGroup).reset(tonormalize);
+		for (int pos = 0; mr.find(pos);) {
+			int groupid = Integer.parseInt(mr.group(1));
+			int start = mr.start(), end = mr.end();
+			if (groupid > m.groupCount()) {
+				LOG.error("Invalid group reference '{}' in normalization pattern of rule: {}", groupid, rule);
+				tonormalize.delete(start, end);
+				continue;
+			}
+			if (LOG.isTraceEnabled()) {
+				LOG.trace("tonormalize:" + tonormalize);
+				LOG.trace("x.group():" + mr.group());
+				LOG.trace("x.group(1):" + mr.group(1));
+				LOG.trace("m.group():" + m.group());
+				LOG.trace("m.group(" + mr.group(1) + "):" + m.group(groupid));
+			}
+			String rep = m.group(groupid);
+			if (rep == null) { // unmatched optional group: drop the reference
+				LOG.debug("Empty group {} in rule {}, '{}'", groupid, rule, m.group());
+				tonormalize.delete(start, end);
+				continue;
+			}
+			tonormalize.replace(start, end, rep);
+			pos = start + rep.length();
+		}
+	}
+
+	/** Expand {@code %FUNC(literal)}: look up a literal value in normalization map FUNC. */
+	private static void expandNormalizationFull(StringBuilder tonormalize, Matcher mr, NormalizationManager norm, String rule) {
+		mr.usePattern(paNormNoGroup).reset(tonormalize);
+		int pos = 0;
+		while (mr.find(pos)) {
+			String normfunc = mr.group(1);
+			String value = mr.group(2);
+			RegexHashMap<String> normmap = norm.getFromHmAllNormalization(normfunc);
+			String rep = normmap != null ? normmap.get(value) : null;
+			if (rep == null) {
+				LOG.warn("'{}' normalization problem. Value: {} in " + //
+						"rule: {} tonormalize: {}", normfunc, value, rule, tonormalize);
+				rep = "";
+			}
+			int start = mr.start(), end = mr.end();
+			tonormalize.replace(start, end, rep);
+			pos = start + rep.length();
+		}
+	}
+
+	/** Apply {@code %SUBSTRING%(str,begin,end)}. */
+	private static void expandSubstringFunction(StringBuilder tonormalize, Matcher mr, MatchResult m, String rule) {
+		mr.usePattern(paSubstring).reset(tonormalize);
+		for (int pos = 0; mr.find(pos);) {
+			int start = mr.start(), end = mr.end();
+			try {
+				String rep = mr.group(1).substring(Integer.parseInt(mr.group(2)), Integer.parseInt(mr.group(3)));
+				tonormalize.replace(start, end, rep);
+				pos = start + rep.length();
+			} catch (StringIndexOutOfBoundsException e) {
+				LOG.error("Substring out of bounds: '{}' for '{}' with rule '{}'", mr.group(), m.group(), rule, e);
+				tonormalize.delete(start, end);
+			}
+		}
+	}
+
+	/** Apply {@code %LOWERCASE%(str)}. */
+	private static void expandLowerCaseFunction(StringBuilder tonormalize, Matcher mr) {
+		mr.usePattern(paLowercase).reset(tonormalize);
+		for (int pos = 0; mr.find(pos);) {
+			String rep = mr.group(1).toLowerCase();
+			int start = mr.start(), end = mr.end();
+			tonormalize.replace(start, end, rep);
+			pos = start + rep.length();
+		}
+	}
+
+	/** Apply {@code %UPPERCASE%(str)}. */
+	private static void expandUpperCaseFunction(StringBuilder tonormalize, Matcher mr) {
+		mr.usePattern(paUppercase).reset(tonormalize);
+		for (int pos = 0; mr.find(pos);) {
+			String rep = mr.group(1).toUpperCase();
+			int start = mr.start(), end = mr.end();
+			tonormalize.replace(start, end, rep);
+			pos = start + rep.length();
+		}
+	}
+
+	/** Apply {@code %SUM%(a,b)}: integer addition of the two arguments. */
+	private static void expandSumFunction(StringBuilder tonormalize, Matcher mr, MatchResult m, String rule) {
+		mr.usePattern(paSum).reset(tonormalize);
+		for (int pos = 0; mr.find(pos);) {
+			int start = mr.start(), end = mr.end();
+			try {
+				String rep = Integer.toString(Integer.parseInt(mr.group(1)) + Integer.parseInt(mr.group(2)));
+				tonormalize.replace(start, end, rep);
+				pos = start + rep.length();
+			} catch (NumberFormatException e) {
+				LOG.error("Failed to expand sum: '{}' for '{}' with rule '{}'", mr.group(), m.group(), rule, e);
+				tonormalize.delete(start, end);
+			}
+		}
+	}
+
+	/** Apply {@code %CHINESENUMBERS%(str)}: convert Chinese to Arabic numerals. */
+	private static void replaceChineseNumerals(StringBuilder tonormalize, Matcher mr) {
+		mr.usePattern(paChineseNorm).reset(tonormalize);
+		for (int pos = 0; mr.find(pos);) {
+			String rep = ChineseNumbers.normalize(mr.group(1));
+			int start = mr.start(), end = mr.end();
+			if (rep == null) {
+				// Drop the unparseable span: leaving it in place would make both this
+				// loop and the caller's outer loop rematch the same text forever.
+				LOG.warn("Failed to normalize Chinese numeral '{}'", mr.group(1));
+				tonormalize.delete(start, end);
+				continue;
+			}
+			tonormalize.replace(start, end, rep);
+			pos = start + rep.length();
+		}
+	}
+}
diff --git a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleManager.java b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleManager.java
index 89f96d28..0e34b48a 100644
--- a/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleManager.java
+++ b/src/de/unihd/dbs/uima/annotator/heideltime/resources/RuleManager.java
@@ -4,106 +4,77 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.util.Collections;
-import java.util.Comparator;
+import java.util.ArrayList;
import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.regex.MatchResult;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
-import de.unihd.dbs.uima.annotator.heideltime.utilities.Toolbox;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- *
- * This class fills the role of a manager of all the rule resources. It reads
- * the data from a file system and fills up a bunch of HashMaps with their
- * information.
+ * This class fills the role of a manager of all the rule resources. It reads the data from a file system and fills up a bunch of HashMaps with their information.
*
* @author jannik stroetgen
- *
*/
public class RuleManager extends GenericResourceManager {
- protected static HashMap instances = new HashMap();
+ /** Class logger */
+ private static final Logger LOG = LoggerFactory.getLogger(RuleManager.class);
- // PATTERNS TO READ RESOURCES "RULES" AND "NORMALIZATION"
- Pattern paReadRules = Pattern.compile("RULENAME=\"(.*?)\",EXTRACTION=\"(.*?)\",NORM_VALUE=\"(.*?)\"(.*)");
-
- // EXTRACTION PARTS OF RULES (patterns loaded from files)
- HashMap hmDatePattern = new HashMap();
- HashMap hmDurationPattern = new HashMap();
- HashMap hmTimePattern = new HashMap();
- HashMap hmSetPattern = new HashMap();
-
- // NORMALIZATION PARTS OF RULES (patterns loaded from files)
- HashMap hmDateNormalization = new HashMap();
- HashMap hmTimeNormalization = new HashMap();
- HashMap hmDurationNormalization = new HashMap();
- HashMap hmSetNormalization = new HashMap();
+ /** Static pool */
+ protected static HashMap instances = new HashMap();
- // OFFSET PARTS OF RULES (patterns loaded from files)
- HashMap hmDateOffset = new HashMap();
- HashMap hmTimeOffset = new HashMap();
- HashMap hmDurationOffset = new HashMap();
- HashMap hmSetOffset = new HashMap();
+ /**
+ * singleton producer.
+ *
+ * @return singleton instance of RuleManager
+ */
+	public static RuleManager getInstance(Language language, boolean load_temponym_resources) {
+		// Plain double-checked locking over an unsynchronized HashMap is unsafe: a racing
+		// reader may observe the map mid-rehash or a not-yet-published RuleManager instance.
+		// This factory is called rarely, so locking the whole lookup is cheap and safe.
+		synchronized (RuleManager.class) {
+			RuleManager rm = instances.get(language.getName());
+			if (rm == null) {
+				rm = new RuleManager(language.getResourceFolder(), load_temponym_resources);
+				instances.put(language.getName(), rm);
+			}
+			return rm;
+		}
+	}
- // QUANT PARTS OF RULES (patterns loaded from files)
- HashMap hmDateQuant = new HashMap();
- HashMap hmTimeQuant = new HashMap();
- HashMap hmDurationQuant = new HashMap();
- HashMap hmSetQuant = new HashMap();
+ /**
+ * Exception thrown when a pattern could not be built.
+ */
+ public static class InvalidPatternException extends RuntimeException {
+ private static final long serialVersionUID = 1L;
- // FREQ PARTS OF RULES (patterns loaded from files)
- HashMap hmDateFreq = new HashMap();
- HashMap hmTimeFreq = new HashMap();
- HashMap hmDurationFreq = new HashMap();
- HashMap hmSetFreq = new HashMap();
+ public InvalidPatternException(String msg) {
+ super(msg);
+ }
- // MOD PARTS OF RULES (patterns loaded from files)
- HashMap hmDateMod = new HashMap();
- HashMap hmTimeMod = new HashMap();
- HashMap hmDurationMod = new HashMap();
- HashMap hmSetMod = new HashMap();
+ public InvalidPatternException(String msg, Throwable cause) {
+ super(msg, cause);
+ }
+ }
- // POS PARTS OF RULES (patterns loaded from files)
- HashMap hmDatePosConstraint = new HashMap();
- HashMap hmTimePosConstraint = new HashMap();
- HashMap hmDurationPosConstraint = new HashMap();
- HashMap hmSetPosConstraint = new HashMap();
-
- // EMPTYVALUE part of rules
- HashMap hmDateEmptyValue = new HashMap();
- HashMap hmTimeEmptyValue = new HashMap();
- HashMap hmDurationEmptyValue = new HashMap();
- HashMap