Skip to content

Commit

Permalink
Merge pull request #887 from xxyzz/nl
Browse files Browse the repository at this point in the history
[nl] improve extract inflection and linkage data code
  • Loading branch information
xxyzz authored Oct 25, 2024
2 parents 7f16b89 + 04fb3f7 commit 03e0faf
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 10 deletions.
22 changes: 13 additions & 9 deletions src/wiktextract/extractor/nl/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,19 @@ def extract_noun_adj_table(
if col_index == 0:
row_header = clean_node(wxr, None, data_node)
else:
form_str = clean_node(wxr, None, data_node)
if form_str not in ["", "-", wxr.wtp.title]:
form = Form(form=form_str)
if row_header not in ["", "naamwoord"]:
form.raw_tags.append(row_header)
if col_index - 1 < len(column_headers):
form.raw_tags.append(column_headers[col_index - 1])
translate_raw_tags(form)
word_entry.forms.append(form)
for form_str in clean_node(
wxr, None, data_node
).splitlines():
if form_str not in ["", "-", wxr.wtp.title]:
form = Form(form=form_str)
if row_header not in ["", "naamwoord"]:
form.raw_tags.append(row_header)
if col_index - 1 < len(column_headers):
form.raw_tags.append(
column_headers[col_index - 1]
)
translate_raw_tags(form)
word_entry.forms.append(form)

for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, word_entry, link_node)
Expand Down
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/nl/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def extract_linkage_list_item(
getattr(word_entry, linkage_type).append(
Linkage(word=word, sense=sense, sense_index=sense_index)
)
elif isinstance(node, TemplateNode) and node.template_name == "expr":
extract_expr_template(wxr, word_entry, node, linkage_type)


def extract_nld_template(
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/nl/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@
# "familie": "family",
"farmacologie": "pharmacology",
# "feest": "party",
# "fietsen": "cycle",
"fietsen": "cycling",
"filatelie": "philately",
"filmkunst": "cinematography",
"filosofie": "philosophy",
Expand Down
34 changes: 34 additions & 0 deletions tests/test_nl_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,3 +196,37 @@ def test_nlstam(self):
},
],
)

def test_nlnoun_lines(self):
    """Check that a table cell holding two ``<br>``-separated forms
    produces two separate entries in ``forms``.

    The stubbed "-nlnoun-" table puts "corpora" and "corpussen" in one
    plural cell; both are expected back as individual plural forms.
    """
    # Stub page registered for the "-nlnoun-" template used by the entry.
    template_body = """{| class="infobox"
|-
!
! [[enkelvoud]]
! [[meervoud]]
|-
| class="infoboxrijhoofding" | [[zelfstandig naamwoord|naamwoord]]
| corpus
| [[corpora]]<br>[[corpussen]]
|}"""
    self.wxr.wtp.add_page("Sjabloon:-nlnoun-", 10, template_body)

    page_text = """==Nederlands==
=====Woordherkomst en -opbouw=====
*Leenwoord
{{-nlnoun-|{{pn}}|[[corpora]]<br>[[{{pn}}sen]]|[[corpusje]]|[[corpusjes]]}}
====Zelfstandig naamwoord====
{{-l-|n}}
# alle verzamelde"""
    entries = parse_page(self.wxr, "corpus", page_text)

    # Only the two plural forms are expected; the singular cell "corpus"
    # does not appear in the result.
    expected_forms = [
        {"form": plural, "tags": ["plural"]}
        for plural in ("corpora", "corpussen")
    ]
    self.assertEqual(entries[0]["forms"], expected_forms)

0 comments on commit 03e0faf

Please sign in to comment.