From 09e91d53c4d26a1e1781a07f31eab8a0ee884e4e Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Mon, 22 Feb 2021 09:54:53 -0300 Subject: [PATCH 01/10] updated summary extraction --- parser/extract/summary.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/parser/extract/summary.go b/parser/extract/summary.go index 6a0ccbd..4976c9f 100644 --- a/parser/extract/summary.go +++ b/parser/extract/summary.go @@ -2,6 +2,7 @@ package extract import ( "github.com/InstIDEA/ddjj/parser/declaration" + "strings" "fmt" ) @@ -17,6 +18,13 @@ func Summary(e *Extractor, parser *ParserData) *declaration.Summary { break } + if strings.Contains(e.CurrToken, "página") && + !hasLeadingSpaces(e.CurrToken, "página") && + !hasTrailingSpaces(e.CurrToken, "página") { + e.MoveUntilContains(CurrToken, "DECLARACIÓN") + continue + } + if isNumber(e.CurrToken) { results[index] = StringToInt64(e.CurrToken) index++ From a8960c0edbeba8339bec306e95f9917fcd207b85 Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Tue, 23 Feb 2021 16:10:36 -0300 Subject: [PATCH 02/10] added control for summary null value --- parser/extract/parser.go | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/parser/extract/parser.go b/parser/extract/parser.go index aa28f2b..9448495 100644 --- a/parser/extract/parser.go +++ b/parser/extract/parser.go @@ -172,16 +172,18 @@ func ParsePDF(file io.Reader) ParserData { d.CalculatePatrimony() - if d.Assets != d.Resumen.TotalActivo { - parser.addMessage("calculated assets and summary assets does not match") - } - - if d.Liabilities != d.Resumen.TotalPasivo { - parser.addMessage("calculated liabilities and summary liabilities does not match") - } - - if d.NetPatrimony != d.Resumen.PatrimonioNeto { - parser.addMessage("calculated net patrimony and summary net patrimony does not match") + if d.Resumen != nil { + if d.Assets != d.Resumen.TotalActivo { + parser.addMessage("calculated assets and summary assets does not match") + } + + if d.Liabilities != d.Resumen.TotalPasivo { + parser.addMessage("calculated liabilities and summary liabilities does not match") + } + + if d.NetPatrimony != d.Resumen.PatrimonioNeto { + parser.addMessage("calculated net patrimony and summary net patrimony does not match") + } } parser.Data = d From 3751e54c0a5c4eae7f0714fe61082959aa94f80a Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Wed, 24 Feb 2021 15:29:16 -0300 Subject: [PATCH 03/10] countries cache implementation --- parser/extract/extractor.go | 26 ---------------------- parser/extract/state.go | 44 ++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/parser/extract/extractor.go b/parser/extract/extractor.go index 0858500..9c64d70 100644 --- a/parser/extract/extractor.go +++ b/parser/extract/extractor.go @@ -2,7 +2,6 @@ package extract import ( "bufio" - "net/http" "regexp" "strconv" "strings" @@ -276,8 +275,6 @@ use extractor struct and methods instead the extractions that using these functions will be reviewed */ -var countries = map[string]bool{} - // MoveUntil finds a word and stops the scan there. func MoveUntil(scanner *bufio.Scanner, search string, exact bool) *bufio.Scanner { for scanner.Scan() { @@ -338,26 +335,3 @@ func isBarCode(line string) bool { matched, _ := regexp.MatchString(`[0-9]{5,6}-[0-9]{5,7}-[0-9]{1,3}`, line) return matched } - -func isCountry(line string) bool { - - if _, ok := countries[line]; ok { - return true - } - - resp, err := http.Get("https://restcountries.eu/rest/v2/name/" + line) - - if err != nil { - return false - } - - defer resp.Body.Close() - - if resp.StatusCode == 404 { - return false - } - - countries[line] = true - - return true -} \ No newline at end of file diff --git a/parser/extract/state.go b/parser/extract/state.go index 0d46fe0..e80f4b0 100644 --- a/parser/extract/state.go +++ b/parser/extract/state.go @@ -9,6 +9,8 @@ import ( "github.com/InstIDEA/ddjj/parser/declaration" ) +var countryCache = map[string]bool{} + var stateTwoLines = []string{ "EXPLOTACION", "TERRENO SIN", @@ -48,6 +50,8 @@ func RealStates(scanner *bufio.Scanner) ([]*declaration.RealState, error) { index := 0 stateItemNumber = 1 + countryCache = getCountryCache() + // Also wants to skip item number skipState = append(skipState, strconv.Itoa(stateItemNumber)) @@ -93,7 +97,7 @@ func RealStates(scanner *bufio.Scanner) ([]*declaration.RealState, error) { func getState(scanner *bufio.Scanner, values [11]string) []*declaration.RealState { // Casos 1, 4, 5. - if isCountry(values[0]) { + if countryCache[removeAccents(strings.ToUpper(values[0]))] { // En el caso 1, el valor en el último index es el tipo de adquisición. if !isNumber(values[10]) { return getState1(values) @@ -312,3 +316,41 @@ func addRealState(states []*declaration.RealState) int64 { return total } + +func getCountryCache() map[string]bool { + // countries + // extracted from https://www.ine.es/daco/daco42/clasificaciones/paises_estandar.xls + countries := [237]string{"AFGANISTAN","ALBANIA","ALEMANIA","ANDORRA","ANGOLA","ANGUILLA","ANTIGUA Y BARBUDA","ANTILLAS HOLANDESAS", + "ARABIA SAUDI","ARGELIA","ARGENTINA","ARMENIA","ARUBA","AUSTRALIA","AUSTRIA","AZERBAIYAN","BAHAMAS","BAHREIN","BANGLADESH","BARBADOS", + "BELARUS","BELGICA","BELICE","BENIN","BERMUDAS","BHUTAN","BOLIVIA","BOSNIA Y HERZEGOVINA","BOTSWANA","BRASIL","BRUNEI", + "BULGARIA","BURKINA FASO","BURUNDI","CABO VERDE","CAMBOYA","CAMERUN","CANADA","CHAD","CHILE","CHINA","CHIPRE","COLOMBIA", + "COMORES","CONGO","COREA","COREA DEL NORTE","COSTA DE MARFIL","COSTA RICA","CROACIA","CUBA","DINAMARCA","DJIBOUTI", + "DOMINICA","ECUADOR","EGIPTO","EL SALVADOR","EMIRATOS ARABES UNIDOS","ERITREA","ESLOVENIA","ESPAÑA","ESTADOS UNIDOS DE AMERICA", + "ESTONIA","ETIOPIA","FIJI","FILIPINAS","FINLANDIA","FRANCIA","GABON","GAMBIA","GEORGIA","GHANA","GIBRALTAR","GRANADA","GRECIA", + "GROENLANDIA","GUADALUPE","GUAM","GUATEMALA","GUAYANA FRANCESA","GUERNESEY","GUINEA","GUINEA ECUATORIAL","GUINEA-BISSAU","GUYANA", + "HAITI","HONDURAS","HONG KONG","HUNGRIA","INDIA","INDONESIA","IRAN","IRAQ","IRLANDA","ISLA DE MAN","ISLA NORFOLK","ISLANDIA", + "ISLAS ALAND","ISLAS CAIMAN","ISLAS COOK","ISLAS DEL CANAL","ISLAS FEROE","ISLAS MALVINAS","ISLAS MARIANAS DEL NORTE", + "ISLAS MARSHALL","ISLAS PITCAIRN","ISLAS SALOMON","ISLAS TURCAS Y CAICOS","ISLAS VIRGENES BRITANICAS", + "ISLAS VIRGENES DE LOS ESTADOS UNIDOS","ISRAEL","ITALIA","JAMAICA","JAPON","JERSEY","JORDANIA","KAZAJSTAN","KENIA","KIRGUISTAN", + "KIRIBATI","KUWAIT","LAOS","LESOTHO","LETONIA","LIBANO","LIBERIA","LIBIA","LIECHTENSTEIN","LITUANIA","LUXEMBURGO","MACAO","MACEDONIA", + "MADAGASCAR","MALASIA","MALAWI","MALDIVAS","MALI","MALTA","MARRUECOS","MARTINICA","MAURICIO","MAURITANIA","MAYOTTE","MEXICO","MICRONESIA", + "MOLDAVIA","MONACO","MONGOLIA","MONTENEGRO","MONTSERRAT","MOZAMBIQUE","MYANMAR","NAMIBIA","NAURU","NEPAL","NICARAGUA","NIGER","NIGERIA","NIUE", + "NORUEGA","NUEVA CALEDONIA","NUEVA ZELANDA","OMAN","PAISES BAJOS","PAKISTAN","PALAOS","PALESTINA","PANAMA","PAPUA NUEVA GUINEA","PARAGUAY", + "PERU","POLINESIA FRANCESA","POLONIA","PORTUGAL","PUERTO RICO","QATAR","REINO UNIDO","REP.DEMOCRATICA DEL CONGO","REPUBLICA CENTROAFRICANA", + "REPUBLICA CHECA","REPUBLICA DOMINICANA","REPUBLICA ESLOVACA","REUNION","RUANDA","RUMANIA","RUSIA","SAHARA OCCIDENTAL","SAMOA", + "SAMOA AMERICANA","SAN BARTOLOME","SAN CRISTOBAL Y NIEVES","SAN MARINO","SAN MARTIN (PARTE FRANCESA)","SAN PEDRO Y MIQUELON", + "SAN VICENTE Y LAS GRANADINAS","SANTA HELENA","SANTA LUCIA","SANTA SEDE","SANTO TOME Y PRINCIPE","SENEGAL","SERBIA","SEYCHELLES", + "SIERRA LEONA","SINGAPUR","SIRIA","SOMALIA","SRI LANKA","SUDAFRICA","SUDAN","SUECIA","SUIZA","SURINAM","SVALBARD Y JAN MAYEN", + "SWAZILANDIA","TADYIKISTAN","TAILANDIA","TANZANIA","TIMOR ORIENTAL","TOGO","TOKELAU","TONGA","TRINIDAD Y TOBAGO","TUNEZ", + "TURKMENISTAN","TURQUIA","TUVALU","UCRANIA","UGANDA","URUGUAY","UZBEKISTAN","VANUATU","VENEZUELA","VIETNAM","WALLIS Y FORTUNA", + "YEMEN","ZAMBIA","ZIMBABWE",} + + cache := map[string]bool{} + + for _, item := range countries { + cache[item] = true + } + + return cache +} + From 1a72007c9f767bf28e6469b2a6313a80b96a7015 Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Wed, 24 Feb 2021 15:35:18 -0300 Subject: [PATCH 04/10] rename countryCache to countriesCache --- parser/extract/state.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parser/extract/state.go b/parser/extract/state.go index e80f4b0..a4ebff7 100644 --- a/parser/extract/state.go +++ b/parser/extract/state.go @@ -9,7 +9,7 @@ import ( "github.com/InstIDEA/ddjj/parser/declaration" ) -var countryCache = map[string]bool{} +var countriesCache = map[string]bool{} var stateTwoLines = []string{ "EXPLOTACION", @@ -50,7 +50,7 @@ func RealStates(scanner *bufio.Scanner) ([]*declaration.RealState, error) { index := 0 stateItemNumber = 1 - countryCache = getCountryCache() + countriesCache = getCountryCache() // Also wants to skip item number skipState = append(skipState, strconv.Itoa(stateItemNumber)) @@ -97,7 +97,7 @@ func RealStates(scanner *bufio.Scanner) ([]*declaration.RealState, error) { func getState(scanner *bufio.Scanner, values [11]string) []*declaration.RealState { // Casos 1, 4, 5. - if countryCache[removeAccents(strings.ToUpper(values[0]))] { + if countriesCache[removeAccents(strings.ToUpper(values[0]))] { // En el caso 1, el valor en el último index es el tipo de adquisición. if !isNumber(values[10]) { return getState1(values) From ed3e52571bae508c95830136ebd5905b84aa9890 Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Wed, 24 Feb 2021 15:38:44 -0300 Subject: [PATCH 05/10] rename getCountryCache to getCountriesCache --- parser/extract/state.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parser/extract/state.go b/parser/extract/state.go index a4ebff7..eacb054 100644 --- a/parser/extract/state.go +++ b/parser/extract/state.go @@ -50,7 +50,7 @@ func RealStates(scanner *bufio.Scanner) ([]*declaration.RealState, error) { index := 0 stateItemNumber = 1 - countriesCache = getCountryCache() + countriesCache = getCountriesCache() // Also wants to skip item number skipState = append(skipState, strconv.Itoa(stateItemNumber)) @@ -317,7 +317,7 @@ func addRealState(states []*declaration.RealState) int64 { return total } -func getCountryCache() map[string]bool { +func getCountriesCache() map[string]bool { // countries // extracted from https://www.ine.es/daco/daco42/clasificaciones/paises_estandar.xls countries := [237]string{"AFGANISTAN","ALBANIA","ALEMANIA","ANDORRA","ANGOLA","ANGUILLA","ANTIGUA Y BARBUDA","ANTILLAS HOLANDESAS", From 2d6d41a369dba7388e6a8f2476658ead9771b803 Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Tue, 2 Mar 2021 15:33:54 -0300 Subject: [PATCH 06/10] support for -layout option --- parser/extract/parser.go | 2 +- parser/go.mod | 6 +++--- parser/go.sum | 30 ++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/parser/extract/parser.go b/parser/extract/parser.go index 9448495..c2d2b04 100644 --- a/parser/extract/parser.go +++ b/parser/extract/parser.go @@ -77,7 +77,7 @@ func ParsePDF(file io.Reader) ParserData { Data: nil, Raw: make([]string, 0), } - res, err := docconv.Convert(file, "application/pdf", true) + res, err := docconv.Convert(file, "application/pdf", false) if err != nil { parser.addError(err) diff --git a/parser/go.mod b/parser/go.mod index dbe9ccc..5276c79 100644 --- a/parser/go.mod +++ b/parser/go.mod @@ -4,11 +4,11 @@ go 1.15 require ( code.sajari.com/docconv v1.1.0 - github.com/JalfResi/justext v0.0.0-20170829062021-c0282dea7198 // indirect github.com/advancedlogic/GoOse v0.0.0-20200830213114-1225d531e0ad // indirect github.com/golang/protobuf v1.4.3 // indirect - github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect - github.com/pkg/errors v0.8.1 + github.com/pkg/errors v0.9.1 github.com/sirupsen/logrus v1.7.0 golang.org/x/net v0.0.0-20210119194325-5f4716e94777 // indirect ) + +replace code.sajari.com/docconv v1.1.0 => github.com/Ravf95/docconv v1.2.0 diff --git a/parser/go.sum b/parser/go.sum index 0d6616f..abf1cb9 100644 --- a/parser/go.sum +++ b/parser/go.sum @@ -4,12 +4,22 @@ github.com/JalfResi/justext v0.0.0-20170829062021-c0282dea7198 h1:8P+AjBhGByCuCX github.com/JalfResi/justext v0.0.0-20170829062021-c0282dea7198/go.mod h1:0SURuH1rsE8aVWvutuMZghRNrNrYEUzibzJfhEYR8L0= github.com/PuerkitoBio/goquery v1.4.1 h1:smcIRGdYm/w7JSbcdeLHEMzxmsBQvl8lhf0dSw2nzMI= github.com/PuerkitoBio/goquery v1.4.1/go.mod h1:T9ezsOHcCrDCgA8aF1Cqr3sSYbO/xgdy8/R/XiIMAhA= +github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= +github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= +github.com/Ravf95/docconv v1.2.0 h1:pIAaPPt4OJlAm91axDJRbKdP5NjiEzdBMfdfUMHVgG4= +github.com/Ravf95/docconv v1.2.0/go.mod h1:DooS873W9YwUjTwEYGpg55aDlvnx1VcEdr7IJ9rEW8g= +github.com/advancedlogic/GoOse v0.0.0-20191112112754-e742535969c1/go.mod h1:f3HCSN1fBWjcpGtXyM119MJgeQl838v6so/PQOqvE1w= github.com/advancedlogic/GoOse v0.0.0-20200830213114-1225d531e0ad h1:gyzOmx++wVkSj5kLzYtvNN2ooeJGTFTtV37t5Do4sdM= github.com/advancedlogic/GoOse v0.0.0-20200830213114-1225d531e0ad/go.mod h1:f3HCSN1fBWjcpGtXyM119MJgeQl838v6so/PQOqvE1w= github.com/andybalholm/cascadia v1.0.0 h1:hOCXnnZ5A+3eVDX8pvgl4kofXv2ELss0bKcqRySc45o= github.com/andybalholm/cascadia v1.0.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= +github.com/andybalholm/cascadia v1.2.0 h1:vuRCkM5Ozh/BfmsaTm26kbjm0mIOM3yS5Ek/F5h18aE= +github.com/andybalholm/cascadia v1.2.0/go.mod h1:YCyR8vOZT9aZ1CHEd8ap0gMVm2aFgxBp0T0eFw1RUQY= github.com/araddon/dateparse v0.0.0-20180729174819-cfd92a431d0e h1:s05JG2GwtJMHaPcXDpo4V35TFgyYZzNsmBlSkHPEbeg= github.com/araddon/dateparse v0.0.0-20180729174819-cfd92a431d0e/go.mod h1:SLqhdZcd+dF3TEVL2RMoob5bBP5R1P1qkox+HtCBgGI= +github.com/araddon/dateparse v0.0.0-20200409225146-d820a6159ab1 h1:TEBmxO80TM04L8IuMWk77SGL1HomBmKTdzdJLLWznxI= +github.com/araddon/dateparse v0.0.0-20200409225146-d820a6159ab1/go.mod h1:SLqhdZcd+dF3TEVL2RMoob5bBP5R1P1qkox+HtCBgGI= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fatih/set v0.2.1 h1:nn2CaJyknWE/6txyUDGwysr3G5QC6xWB/PtVjPBbeaA= @@ -18,11 +28,14 @@ github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 h1:u8AQ9bPa9oC+8 github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573/go.mod h1:eBvb3i++NHDH4Ugo9qCvMw8t0mTSctaEa5blJbWcNxs= github.com/go-resty/resty/v2 v2.0.0 h1:9Nq/U+V4xsoDnDa/iTrABDWUCuk3Ne92XFHPe6dKWUc= github.com/go-resty/resty/v2 v2.0.0/go.mod h1:dZGr0i9PLlaaTD4H/hoZIDjQ+r6xq8mgbRzHZf7f2J8= +github.com/go-resty/resty/v2 v2.3.0 h1:JOOeAvjSlapTT92p8xiS19Zxev1neGikoHsXJeOq8So= +github.com/go-resty/resty/v2 v2.3.0/go.mod h1:UpN9CgLZNsv4e9XG50UU8xdI0F43UQ4HmxLBDwaroHU= github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3 h1:JjCZWpVbqXDqFVmTfYWEVTMIYrL/NPdPSCHPJ0T/raM= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= @@ -30,14 +43,26 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0 h1:xqgexXAGQgY3HAjNPSaCqn5Aahbo5TKsmhp8VRfr1iQ= github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= +github.com/jaytaylor/html2text v0.0.0-20200412013138-3577fbdbcff7 h1:g0fAGBisHaEQ0TRq1iBvemFRf+8AEWEmBESSiWB3Vsc= +github.com/jaytaylor/html2text v0.0.0-20200412013138-3577fbdbcff7/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 h1:W7p+m/AECTL3s/YR5RpQ4hz5SjNeKzZBl1q36ws12s0= github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5/go.mod h1:QMe2wuKJ0o7zIVE8AqiT8rd8epmm6WDIZ2wyuBqYPzM= github.com/mattn/go-runewidth v0.0.3 h1:a+kO+98RDGEfo6asOGMmpodZq4FNtnGP54yps8BzLR4= github.com/mattn/go-runewidth v0.0.3/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= +github.com/mattn/go-runewidth v0.0.7/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0= +github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= github.com/olekukonko/tablewriter v0.0.0-20180506121414-d4647c9c7a84 h1:fiKJgB4JDUd43CApkmCeTSQlWjtTtABrU2qsgbuP0BI= github.com/olekukonko/tablewriter v0.0.0-20180506121414-d4647c9c7a84/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= +github.com/olekukonko/tablewriter v0.0.4 h1:vHD/YYe1Wolo78koG299f7V/VAS08c6IpCLn+Ejf/w8= +github.com/olekukonko/tablewriter v0.0.4/go.mod h1:zq6QwlOf5SlnkVbMSr5EoBv3636FWnp+qbPhuoO21uA= +github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE= +github.com/otiai10/gosseract/v2 v2.2.4/go.mod h1:ahOp/kHojnOMGv1RaUnR0jwY5JVa6BYKhYAS8nbMLSo= +github.com/otiai10/mint v1.3.0/go.mod h1:F5AjcsTsWUqX+Na9fpHb52P8pcRX2CI6A3ctIT91xUo= github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/simplereach/timeutils v1.2.0/go.mod h1:VVbQDfN/FHRZa1LSqcwo4kNZ62OOyqLLGQKYB3pB0Q8= github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM= @@ -50,15 +75,20 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200602114024-627f9648deb9/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20210119194325-5f4716e94777 h1:003p0dJM77cxMSyCPFphvZf/Y5/NXf5fzg6ufd1/Oew= golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037 h1:YyJpGZS1sBuBCzLAR1VEpK193GlqGZbnPFnPV/5Rsb4= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68 h1:nxC68pudNYkKU6jWhgrqdreuFiOQWj1Fs7T3VrH4Pjw= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3 h1:cokOdA+Jmi5PJGXLlLllQSgYigAEfHXJAERHVMaCc2k= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= From 0aae0a232c86f0e13694d88ea973f25942d116b8 Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Tue, 2 Mar 2021 15:49:25 -0300 Subject: [PATCH 07/10] new extractor flags --- parser/extract/extractor.go | 62 +++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/parser/extract/extractor.go b/parser/extract/extractor.go index 9c64d70..d76c08f 100644 --- a/parser/extract/extractor.go +++ b/parser/extract/extractor.go @@ -18,6 +18,8 @@ type Extractor struct { CurrLine int SavedLine int + + Buffer []string Flags ExtractorFlag } @@ -36,6 +38,12 @@ type ExtractorFlag int const ( // the tokens skip blank lines EXTRACTOR_FLAG_1 = 1<<(iota + 1) + + // trim leading and trailing spaces from tokens + EXTRACTOR_FLAG_2 + + // line tokenizer + EXTRACTOR_FLAG_3 ) func NewExtractor(raw string) *Extractor { @@ -48,14 +56,36 @@ func NewExtractor(raw string) *Extractor { func (e *Extractor) Scan() bool { scan := func(s *bufio.Scanner) (string, bool) { + text := "" + + if e.Flags & EXTRACTOR_FLAG_3 != 0 && + len(e.Buffer) > 1 { + text = e.Buffer[1] + e.Buffer = e.Buffer[1:] + return text, true + } + for s.Scan() { + text = s.Text() if e.Flags & EXTRACTOR_FLAG_1 != 0 { - if s.Text() == "" { + if text == "" { continue } } - return s.Text(), true + + if e.Flags & EXTRACTOR_FLAG_2 != 0 { + text = strings.TrimSpace(text) + } + + if e.Flags & EXTRACTOR_FLAG_3 != 0 && + text != "" { + e.Buffer = tokenize(text, 3) + text = e.Buffer[0] + } + + return text, true } + return "", false } @@ -267,6 +297,34 @@ func removeAccents(s string) string { return r.Replace(s) } +// split a line into words that not exceed the max continuous spaces +func tokenize(line string, max int) []string { + var tokens []string + var buffer strings.Builder + var spaces int + + line = strings.TrimSpace(line) + for _, letter := range line { + if letter == ' ' { + spaces++ + buffer.WriteRune(letter) + continue + } + + if spaces >= max { + token := strings.TrimSpace(buffer.String()) + if token != "" { + tokens = append(tokens, token) + } + buffer.Reset() + } + spaces = 0 + buffer.WriteRune(letter) + } + tokens = append(tokens, strings.TrimSpace(buffer.String())) + return tokens +} + /* legacy code support don't use these functions From 33f7fd529c042a30bcbd035b955617c0a8d6777a Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Tue, 2 Mar 2021 15:59:24 -0300 Subject: [PATCH 08/10] update jobs extraction --- parser/extract/jobs.go | 351 ++++----------------------------------- parser/extract/parser.go | 11 +- 2 files changed, 40 insertions(+), 322 deletions(-) diff --git a/parser/extract/jobs.go b/parser/extract/jobs.go index 6e4ce7d..7a0871c 100644 --- a/parser/extract/jobs.go +++ b/parser/extract/jobs.go @@ -6,82 +6,42 @@ import ( "github.com/InstIDEA/ddjj/parser/declaration" ) -// TODO -// create a function to try sort inst-title pairs - -/* -analysis data - -raw data: - -linea: 57 ; INTITUCION: -linea: 64 ; CARGO: -linea: 66 ; i->POLICIA NACIONAL -linea: 70 ; INTITUCION: -linea: 71 ; c->RESGUARDO POLICIAL -linea: 79 ; CARGO: -linea: 79 ; c->SUB OFICIAL SEGUNDO -linea: 82 ; i->MINISTERIO DE TRABAJO, EMPLEO Y SEGURIDAD SOCIAL - -parser output: - - "instituciones": [ - { - "cargo": "RESGUARDO POLICIAL", - "institucion": "POLICIA NACIONAL" - }, - { - "cargo": "SUB OFICIAL SEGUNDO", - "institucion": "MINISTERIO DE TRABAJO, EMPLEO Y SEGURIDAD SOCIAL" - } - ] - -expected order: - -(SUB OFICIAL SEGUNDO, POLICIA NACIONAL) -(RESGUARDO POLICIAL, MINISTERIO DE TRABAJO, EMPLEO Y SEGURIDAD SOCIAL) -*/ - -var instsCache = map[string]bool{} - func Jobs(e *Extractor, parser *ParserData) []*declaration.Job { + e.BindFlag(EXTRACTOR_FLAG_1) + e.BindFlag(EXTRACTOR_FLAG_2) + var instituciones []*declaration.Job - var resultsPositions []int // for valid and invalid results var counter = countJobs(e) var successful int - instsCache = getInstsCache() - e.Rewind() - e.BindFlag(EXTRACTOR_FLAG_1) + e.BindFlag(EXTRACTOR_FLAG_3) job := &declaration.Job{ } if counter > 0 && e.MoveUntilStartWith(CurrToken, "DATOS LABORALES") { - e.SaveLine() for e.Scan() { if counter == successful { break } - if job.Cargo == "" { - value := getJobTitle(e, &resultsPositions) + if job.Institucion == "" { + value := getJobInst(e) - if isValidJobTitle(value) { - job.Cargo = value + if !isJobFormField(value) { + job.Institucion = value } } - if job.Cargo != "" && - job.Institucion == "" { - value := getJobInst(e, &resultsPositions) + if job.Cargo == "" && + job.Institucion != "" { + value := getJobTitle(e) - if isValidJobInst(value) { - job.Institucion = value - instsCache[value] = true + if !isJobFormField(value) { + job.Cargo = value } } @@ -89,7 +49,6 @@ func Jobs(e *Extractor, parser *ParserData) []*declaration.Job { successful++ instituciones = append(instituciones, job) job = &declaration.Job{ } - e.MoveUntilSavedLine() } } } @@ -106,124 +65,36 @@ func Jobs(e *Extractor, parser *ParserData) []*declaration.Job { return instituciones } -func getJobTitle(e *Extractor, pos *[]int) string { - // control positions are based on first matching tokens +func getJobTitle(e *Extractor) string { - var value string - - if strings.Contains(e.CurrToken, "CARGO") && - !ContainsIntItem(*pos, e.CurrLineNum()) { + if strings.Contains(e.CurrToken, "CARGO") { val, check := isKeyValuePair(e.CurrToken, "CARGO") if check { - *pos = append(*pos, e.CurrLineNum()) - e.MoveUntilSavedLine() return val } } - if isCurrLine(e.CurrToken, "INSTITUCIÓN") { - if !ContainsIntItem(*pos, e.PrevLineNum()) { - value = e.PrevToken - *pos = append(*pos, e.PrevLineNum()) - e.MoveUntilSavedLine() - return value - } - - if !ContainsIntItem(*pos, e.NextLineNum()) { - value = e.NextToken - *pos = append(*pos, e.NextLineNum()) - e.MoveUntilSavedLine() - return value - } - } + if strings.Contains(e.PrevToken, "CARGO") && + strings.Contains(e.CurrToken, "FECHA EGRESO") { + return e.NextToken - if isCurrLine(e.PrevToken, "TELÉFONO") && - !isCurrLine(e.NextToken, "CARGO") && - !ContainsIntItem(*pos, e.CurrLineNum()) { - value = e.CurrToken - *pos = append(*pos, e.CurrLineNum()) - e.MoveUntilSavedLine() - return value } return "" } -func getJobInst(e *Extractor, pos *[]int) string { - var value string +func getJobInst(e *Extractor) string { - if instsCache[removeAccents(e.CurrToken)] && - !ContainsIntItem(*pos, e.CurrLineNum()) { - value = e.CurrToken - *pos = append(*pos, e.CurrLineNum()) - e.MoveUntilSavedLine() - return value + if strings.Contains(e.PrevToken, "INSTITUCIÓN") && + strings.Contains(e.NextToken, "ACTO ADM. COM") { + return e.CurrToken } - // control positions are based on first matching tokens - - if isCurrLine(e.PrevToken, "INSTITUCIÓN") && - isCurrLine(e.CurrToken, "TIPO") && - !ContainsIntItem(*pos, e.NextLineNum()) { - value = e.NextToken - *pos = append(*pos, e.NextLineNum()) - e.MoveUntilSavedLine() - return value - } - - if isCurrLine(e.PrevToken, "DIRECCIÓN") { - _, check := isKeyValuePair(e.PrevToken, "DIRECCIÓN") - - if check && - !ContainsIntItem(*pos, e.NextLineNum()) { - value = e.NextToken - *pos = append(*pos, e.NextLineNum()) - e.MoveUntilSavedLine() - return value - } - - if !ContainsIntItem(*pos, e.CurrLineNum()) { - value = e.CurrToken - *pos = append(*pos, e.CurrLineNum()) - e.MoveUntilSavedLine() - return value - } - } - - if isCurrLine(e.CurrToken, "FECHA ASUNC") && - !ContainsIntItem(*pos, e.NextLineNum()) { - value = e.NextToken - *pos = append(*pos, e.NextLineNum()) - e.MoveUntilSavedLine() - return value + if strings.Contains(e.PrevToken, "DIRECCIÓN") && + isNumber(e.CurrToken) { + return e.NextToken } - if isCurrLine(e.PrevToken, "COMISIONADO") && - isCurrLine(e.NextToken, "DEPENDENCIA") && - !ContainsIntItem(*pos, e.CurrLineNum()) { - value = e.CurrToken - *pos = append(*pos, e.CurrLineNum()) - e.MoveUntilSavedLine() - return value - } - - if isCurrLine(e.NextToken, "CARGO") && - isCurrLine(e.CurrToken, "NOMBRADO/CONTRATADO") && - !ContainsIntItem(*pos, e.PrevLineNum()) { - value = e.PrevToken - *pos = append(*pos, e.PrevLineNum()) - e.MoveUntilSavedLine() - return value - } - - if isCurrLine(e.NextToken, "COMISIONADO") && - !ContainsIntItem(*pos, e.PrevLineNum()) { - value = e.PrevToken - *pos = append(*pos, e.PrevLineNum()) - e.MoveUntilSavedLine() - return value - } - return "" } @@ -231,168 +102,18 @@ func countJobs(e *Extractor) int { var counter int for e.Scan() { - // first position - if isCurrLine(e.CurrToken, "CARGO") { - counter++ - continue - } - - // middle position - if hasLeadingSpaces(e.CurrToken, "CARGO") && - !endsWith(e.CurrToken, "CARGO") { + if strings.Contains(e.CurrToken, "CARGO:") { counter++ } } return counter } -func isValidJobTitle(s string) bool { - if len(s) < 5 { // minimum length for job title - return false - } - - if isDate(s) { - return false - } - - if isNumber(s) { - return false - } - - if isPhoneNumber(s) { - return false - } - - if isJobFormField(s) { - return false - } - - if isJobFormCommonAnswer(s) { - return false - } - - return true -} - -func isValidJobInst(s string) bool { - - if containsKWOfInsts(s) { - return true - } - - if len(s) < 5 { // minimum length for institute - return false - } - - if isDate(s) { - return false - } - - if isNumber(s) { - return false - } - - if isPhoneNumber(s) { - return false - } - - if isAddressStreet(s) { - return false - } - - if isJobFormField(s) { - return false - } - - if isJobFormCommonAnswer(s) { - return false - } - - return true -} - -func getInstsCache() map[string]bool { - // last update 18/02/2020 - // from https://datos.sfp.gov.py/data/oee and parser results - institutions := [200]string{ "ADMINISTRACION NACIONAL DE ELECTRICIDAD", "ADMINISTRACION NACIONAL DE NAVEGACION Y PUERTOS", "AGENCIA FINANCIERA DE DESARROLO", - "AGENCIA NACIONAL DE EVALUACION Y ACREDITACION DE LA EDUCACION", "AGENCIA NACIONAL DE TRANSITO Y SEGURIDAD VIAL", "ARMADA NACIONAL", - "AUDITORIA GENERAL DEL PODER EJECUTIVO", "AUTORIDAD REGULADORA RADIOLOGICA Y NUCLEAR", "BANCO CENTRAL DEL PARAGUAY", "BANCO NACIONAL DE FOMENTO", - "CAJA DE JUBILACIONES Y PENSIONES DE EMPLEADOS BANCARIOS", "CAJA DE JUBILACIONES Y PENSIONES DEL PENSONAL DE BANCOS Y AFINES", - "CAJA DE JUBILACIONES Y PENSIONES DEL PERSONAL DE LA ANDE", "CAJA DE JUBILACIONES Y PENSIONES DEL PERSONAL MUNICIPAL", - "CAJA DE PRESTAMOS DEL MINISTERIO DE DEFENSA NACIONAL", "CAJA DE SEGURIDAD SOCIAL DE EMPLEADOS Y OBREROS FERROVIARIOS", "CAMARA DE DIPUTADOS", - "CAMARA DE SENADORES", "CAÑAS PARAGUAYAS", "COMANDO DE LAS FUERZAS MILITARES", "COMISION NACIONAL DE LA COMPETENCIA", - "COMISION NACIONAL DE TELECOMUNICACIONES", "COMISION NACIONAL DE VALORES", "COMPAÑIA PARAGUAYA DE COMUNICACIONES", "CONGRESO NACIONAL", - "CONSEJO DE LA MAGISTRATURA", "CONSEJO NACIONAL DE CIENCIA Y TECNOLOGIA", "CONSEJO NACIONAL DE EDUCACION SUPERIOR", "CONTRALORIA GENERAL DE LA REPUBLICA", - "CORTE SUPREMA DE JUSTICIA", "CREDITO AGRICOLA DE HABILITACION", "DEFENSORIA DEL PUEBLO", "DIRECCION DE BENEFICENCIA Y AYUDA SOCIAL", "DIRECCION DE CONTRATACIONES", - "DIRECCION GENERAL DE ESTADISTICA, ENCUESTA Y CENSO", "DIRECCION NACIONAL DE ADUANAS", "DIRECCION NACIONAL DE AERONAUTICA CIVIL", "DIRECCION NACIONAL DE BENEFICENCIA", - "DIRECCION NACIONAL DE CONTRATACIONES PUBLICAS", "DIRECCION NACIONAL DE CORREOS DEL PARAGUAY", "DIRECCION NACIONAL DE DEFENSA, SALUD Y BIENESTAR ANIMAL", - "DIRECCION NACIONAL DEL REGISTRO DEL ESTADO CIVIL DE LAS PERSONAS", "DIRECCION NACIONAL DE PROPIEDAD INTELECTUAL", "DIRECCION NACIONAL DE TRANSPORTE", - "EMPRESA DE SERVICIOS SANITARIOS DEL PARAGUAY", "ENTE REGULADOR DE SERVICIOS SANITARIOS", "FERROCARRILES DEL PARAGUAY", "FONDO GANADERO", - "FONDO NACIONAL DE LA CULTURA Y LAS ARTES", "GOBERNACION DE ALTO PARAGUAY", "GOBERNACION DE ALTO PARANA", "GOBERNACION DE AMAMBAY", "GOBERNACION DE BOQUERON", - "GOBERNACION DE CAAGUAZU", "GOBERNACION DE CAAZAPA", "GOBERNACION DE CANINDEYU", "GOBERNACION DE CONCEPCION", "GOBERNACION DE CORDILLERA", "GOBERNACION DE GUAIRA", - "GOBERNACION DE ITAPUA", "GOBERNACION DEL GUAIRA", "GOBERNACION DE MISIONES", "GOBERNACION DE ÑEEMBUCU", "GOBERNACION DE PARAGUARI", "GOBERNACION DE PRESIDENTE HAYES", - "GOBERNACION DE SAN PEDRO", "GOBIERNO DEPARTAMENTAL DE CENTRAL", "GOBIERNO DEPARTAMENTAL DE PARAGUARI", "HOSPITAL DE EMERGENCIAS MEDICAS", - "HOSPITAL REGIONAL SALTO DEL GUAIRA", "INDUSTRIA NACIONAL DEL CEMENTO", "INSTITUTO DE PREVISION SOCIAL", "INSTITUTO FORESTAL NACIONAL", - "INSTITUTO NACIONAL DE ALIMENTACION Y NUTRICION", "INSTITUTO NACIONAL DE COOPERATIVISMO", "INSTITUTO NACIONAL DE DESARROLLO RURAL Y DE LA TIERRA", - "INSTITUTO NACIONAL DE EDUCACION SUPERIOR DR. RAUL PEÑA", "INSTITUTO NACIONAL DEL COOPERATIVISMO", "INSTITUTO NACIONAL DEL INDIGENA", "INSTITUTO NACIONAL DE TECNOLOGIA", - "INSTITUTO PARAGUAYO DE ARTESANI", "INSTITUTO PARAGUAYO DEL INDIGENA", "INSTITUTO PARAGUAYO DE TECNOLOGIA AGRARIA", "INSTITUTO SUPERIOR DE BELLAS ARTES", "ITAIPU", - "JURADO DE ENJUICIAMIENTO DE MAGISTRADOS", "MECANISMO NACIONAL DE PREVENSION CONTRA LA TORTURA", "MINISTERIO DE AGRICULTURA Y GANADERIA", "MINISTERIO DE DEFENSA NACIONAL", - "MINISTERIO DE DESARROLLO SOCIAL", "MINISTERIO DE EDUCACION Y CULTURA", "MINISTERIO DE HACIENDA", "MINISTERIO DE INDUSTRIA Y COMERCIO", "MINISTERIO DE JUSTICIA", - "MINISTERIO DE LA DEFENSA PUBLICA", "MINISTERIO DEL AMBIENTE Y DESARROLLO SOSTENIBLE", "MINISTERIO DE LA MUJER", "MINISTERIO DE LA NIÑEZ Y ADOLESCENCIA", - "MINISTERIO DEL INTERIOR", "MINISTERIO DEL TRABAJO, EMPLEO Y SEGURIDAD SOCIAL", "MINISTERIO DE OBRAS PUBLICAS Y COMUNICACIONES", "MINISTERIO DE RELACIONES EXTERIORES", - "MINISTERIO DE SALUD PUBLICA Y BIENESTAR SOCIAL", "MINISTERIO DE TECNOLOGIAS DE LA INFORMACION Y COMUNICACION", "MINISTERIO DE TRABAJO, EMPLEO Y SEGURIDAD SOCIAL", - "MINISTERIO DE URBANISMO, VIVIENDA Y HABITAT", "MINISTERIO PUBLICO", "MINISTERIO PUBLICO FISCALIA GENERAL DEL ESTADO", "MUNICIPALIDAD DE 3 DE MAYO", - "MUNICIPALIDAD DE ABAI", "MUNICIPALIDAD DE ASUNCION", "MUNICIPALIDAD DE CAACUPE", "MUNICIPALIDAD DE CAAPUCU", "MUNICIPALIDAD DE CAMBYRETA", "MUNICIPALIDAD DE CAPIATA", - "MUNICIPALIDAD DE CAPITAN BADO", "MUNICIPALIDAD DE CARAPEGUA", "MUNICIPALIDAD DE CIUDAD DEL ESTE", "MUNICIPALIDAD DE CORONEL BOGADO", "MUNICIPALIDAD DE DR. JUAN LEON MALLORQUIN", - "MUNICIPALIDAD DE EDELIRA", "MUNICIPALIDAD DE ENCARNACION", "MUNICIPALIDAD DE FERNANDO DE LA MORA", "MUNICIPALIDAD DE FILADELFIA", "MUNICIPALIDAD DE FUERTE OLIMPO", - "MUNICIPALIDAD DE FULGENCIO YEGROS", "MUNICIPALIDAD DE GENERAL ARTIGAS", "MUNICIPALIDAD DE HERNANDARIAS", "MUNICIPALIDAD DE HOHENAU", "MUNICIPALIDAD DE HORQUETA", - "MUNICIPALIDAD DE ITAKYRY", "MUNICIPALIDAD DE ITAUGUA", "MUNICIPALIDAD DE JUAN EMILIO O’LEARY", "MUNICIPALIDAD DE KATUETE", "MUNICIPALIDAD DE LAMBARE", "MUNICIPALIDAD DE LA PAZ", - "MUNICIPALIDAD DE LOMA PLATA", "MUNICIPALIDAD DE LUQUE", "MUNICIPALIDAD DE MCAL. FRANCISCO SOLANO LOPEZ", "MUNICIPALIDAD DE NATALIO", "MUNICIPALIDAD DE NUEVA TOLEDO", - "MUNICIPALIDAD DE PARAGUARI", "MUNICIPALIDAD DE PASO BARRETO", "MUNICIPALIDAD DE PEDRO JUAN CABALLERO", "MUNICIPALIDAD DE QUYQUYHO", "MUNICIPALIDAD DE SAN COSME Y DAMIAN", - "MUNICIPALIDAD DE SAN JUAN DEL PARANA", "MUNICIPALIDAD DE SAN JUAN NEPOMUCENO", "MUNICIPALIDAD DE SAN LORENZO", "MUNICIPALIDAD DE SAN PEDRO DEL YCUAMANDIYU", - "MUNICIPALIDAD DE SAN RAFAEL DEL PARANA", "MUNICIPALIDAD DE SAPUCAI", "MUNICIPALIDAD DE SARGENTO JOSE FELIX LOPEZ", "MUNICIPALIDAD DE TEMBIAPORA", "MUNICIPALIDAD DE TOBATI", - "MUNICIPALIDAD DE TTE.1º MANUEL IRALA FERNANDEZ", "MUNICIPALIDAD DE VILLA ELISA", "MUNICIPALIDAD DE VILLETA", "MUNICIPALIDAD DE YBY PYTA", "MUNICIPALIDAD DE YBY YAU", - "MUNICIPALIDAD DE YRYBUCUA", "PETROLEOS PARAGUAYOS", "PODER JUDICIAL", "POLICIA NACIONAL", "PRESIDENCIA DE LA REPUBLICA", "PROCURADURIA GENERAL DE LA REPUBLICA", "SECRETARIA DE ACCION SOCIAL", - "SECRETARIA DE DEFENSA AL CONSUMIDOR Y AL USUARIO", "SECRETARIA DE EMERGENCIA NACIONAL", "SECRETARIA DE INFORMACION Y COMUNICACION", "SECRETARIA DEL AMBIENTE", - "SECRETARIA DE LA NIÑEZ Y DE LA ADOLESCENCIA", "SECRETARIA DE PREVENCION DE LAVADO DE DINERO O BIENES", "SECRETARIA NACIONAL ANTICORRUPCION", "SECRETARIA NACIONAL ANTIDROGAS", - "SECRETARIA NACIONAL DE DEPORTES", "SECRETARIA NACIONAL DE INTELIGENCIA", "SECRETARIA NACIONAL DE LA CULTURA", "SECRETARIA NACIONAL DE LA JUVENTUD", - "SECRETARIA NACIONAL DE LA VIVIENDA Y EL HABITAT", "SECRETARIA NACIONAL DE TURISMO", "SECRETARIA TECNICA DE PLANIFICACION", "SERVICIO NACIONAL DE CALIDAD VEGETAL Y DE SEMILLA", - "SERVICIO NACIONAL DE CALIDAD Y SALUD ANIMAL", "SERVICIO NACIONAL DE CALIDAD Y SANIDAD VEGETAL Y DE SEMILLAS", "SERVICIO NACIONAL DE SANEAMIENTO AMBIENTAL", - "SERVICIO NACIONAL DE PROMOCION PROFESIONAL", "SINDICATURA GENERAL DE QUIEBRAS", "TRIBUNAL SUPERIOR DE JUSTICIA ELECTORAL", "UNIVERSIDAD NACIONAL DE ASUNCION", - "UNIVERSIDAD NACIONAL DE CAAGUAZU", "UNIVERSIDAD NACIONAL DE CANINDEYU", "UNIVERSIDAD NACIONAL DE CONCEPCION", "UNIVERSIDAD NACIONAL DE ITAPUA", "UNIVERSIDAD NACIONAL DEL ESTE", - "UNIVERSIDAD NACIONAL DE PILAR", "UNIVERSIDAD NACIONAL DE VILLARRICA DEL ESPIRITU SANTO", "VICEPRESIDENCIA DE LA REPUBLICA", "INSTITUTO DE PREVISION SOCIAL", "YACYRETA", } - - cache := map[string]bool{} - - for _, item := range institutions { - cache[item] = true - } - return cache -} - -func containsKWOfInsts(s string) bool { - kw := [40]string { "ENTIDAD","MUNICIPALIDAD","CAJA DE","CONSEJO","UNIVERSIDAD", - "CREDITO","EMPRESA","AGENCIA","SECRETARIA","POLICIA","ADMINISTRACION","MINISTERIO", - "CONGRESO","CAMARA","PODER","TRIBUNAL","INSTITUTO","COMANDO","GOBERNACION","CORTE", - "SERVICIO","DIRECCION","COMPAÑIA","HOSPITAL","INDUSTRIA","JURADO","SINDICATURA", - "MECANISMO","BANCO","FONDO","ENTE","FACULTAD","INDUSTRIA","AUDITORIA","PROCURADURIA", - "COMISION","HONORABLE","ARMADA","NACIONAL","COLEGIO", } - - s = removeAccents(s) - for _, value := range kw { - // if kw is first word of current token - if hasTrailingSpaces(s, value) { - return true - } - } - return false -} - func isJobFormField(s string) bool { formField := []string { "TIPO", - "INSTITUCION", - "DIRECCION", + "INSTITUCION:", + "DIRECCION:", "DEPENDENCIA", "CATEGORIA", "NOMBRADO/CONTRATADO", @@ -401,9 +122,10 @@ func isJobFormField(s string) bool { "ACTO ADMINIST", "FECHA ACT. ADM", "TELEFONO", - "COMISIONADO", + "COMISIONADO:", "FECHA INGRESO", "FECHA EGRESO", + "ACTO ADM. COM", } s = removeAccents(s) @@ -415,16 +137,3 @@ func isJobFormField(s string) bool { return false } - -func isJobFormCommonAnswer(s string) bool { - commonAnswer := []string{ "SI", "NO", "PERSONAL DE BLANCO", "RECEPCIONADO" } - - s = removeAccents(s) - for _, value := range commonAnswer { - if s == value { - return true - } - } - - return false -} diff --git a/parser/extract/parser.go b/parser/extract/parser.go index c2d2b04..9498d34 100644 --- a/parser/extract/parser.go +++ b/parser/extract/parser.go @@ -85,6 +85,15 @@ func ParsePDF(file io.Reader) ParserData { return parser } + // maintain original physical layout + pl_res, pl_err := docconv.Convert(file, "application/pdf", true) + + if pl_err != nil { + parser.addError(pl_err) + parser.Status = 1 + return parser + } + parser.rawData(res.Body) d := &declaration.Declaration{} @@ -99,7 +108,7 @@ func ParsePDF(file io.Reader) ParserData { d.Conyuge = parser.checkStr(Spouse(NewExtractor(res.Body))) // Jobs - d.Instituciones = Jobs(NewExtractor(res.Body), &parser) + d.Instituciones = Jobs(NewExtractor(pl_res.Body), &parser) // Deposits scanner := bufio.NewScanner(strings.NewReader(res.Body)) From 67c820f73bbacc7231dc7549363276aa8382cb9f Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Wed, 3 Mar 2021 08:48:09 -0300 Subject: [PATCH 09/10] clear extractor buffer on rewind --- parser/extract/extractor.go | 1 + 1 file changed, 1 insertion(+) diff --git a/parser/extract/extractor.go b/parser/extract/extractor.go index d76c08f..e27b10a 100644 --- a/parser/extract/extractor.go +++ b/parser/extract/extractor.go @@ -136,6 +136,7 @@ func (e *Extractor) MoveUntilSavedLine() { func (e *Extractor) Rewind() { e.Scanner = bufio.NewScanner(strings.NewReader(e.RawData)) + e.Buffer = []string{} e.CurrLine = 0 e.PrevToken = "" e.CurrToken = "" From 46dc77c588ecc3aad3ee85bec808ba8ddbb49a27 Mon Sep 17 00:00:00 2001 From: Ravf95 Date: Wed, 3 Mar 2021 08:58:30 -0300 Subject: [PATCH 10/10] update flag description --- parser/extract/extractor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/extract/extractor.go b/parser/extract/extractor.go index e27b10a..61faee4 100644 --- a/parser/extract/extractor.go +++ b/parser/extract/extractor.go @@ -39,7 +39,7 @@ const ( // the tokens skip blank lines EXTRACTOR_FLAG_1 = 1<<(iota + 1) - // trim leading and trailing spaces from tokens + // remove begin and end spaces from tokens EXTRACTOR_FLAG_2 // line tokenizer