#212 #238 Hindi/Urdu under Hindustani and combining queries

scribe-org · Oct 10, 2024 · ca4698b · ca4698b
1 parent 213769a
commit ca4698b
Show file tree

Hide file tree

Showing 15 changed files with 223 additions and 73 deletions.
diff --git a/src/scribe_data/language_data_extraction/Hindi/Adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Hindi/Adjectives/query_adjectives.sparql
diff --git a/src/scribe_data/language_data_extraction/Hindi/emoji_keywords/___init__.py b/src/scribe_data/language_data_extraction/Hindi/emoji_keywords/___init__.py
diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/adjectives/query_adjectives.sparql b/src/scribe_data/language_data_extraction/Hindustani/Hindi/adjectives/query_adjectives.sparql
@@ -0,0 +1,162 @@
+# tool: scribe-data
+# All Hindi (from Hindustani Q11051) adjectives with the included grammatical forms.
+# Enter this query at https://query.wikidata.org/.
+
+# Note: We need to filter for "hi" to remove Urdu (ur) words.
+
+SELECT DISTINCT
+  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
+  ?adjective
+  ?singulativeNumeral
+  ?collectiveNumeral
+  ?femSingularDirect
+  ?masSingularDirect
+  ?femPluralDirect
+  ?masPluralDirect
+  ?femSingularOblique
+  ?masSingularOblique
+  ?femPluralOblique
+  ?masPluralOblique
+  ?femSingularVocative
+  ?masSingularVocative
+  ?femPluralVocative
+  ?masPluralVocative
+
+WHERE {
+  # Adjective lexemes filed under Hindustani (Q11051); the "hi" tag on the
+  # lemma keeps the Hindi entries and drops the Urdu ("ur") ones.
+  ?lexeme dct:language wd:Q11051 ;
+    wikibase:lexicalCategory wd:Q34698 ;
+    wikibase:lemma ?adjective .
+  FILTER(LANG(?adjective) = "hi")
+
+  # Every block below is OPTIONAL, so a lexeme that lacks a given form is
+  # still returned with that column left unbound.
+
+  # MARK: Singulative Numeral
+
+  # NOTE(review): only wd:Q110786 is required here, so any form carrying that
+  # feature can bind, including forms that also bind in the Direct, Oblique
+  # and Vocative blocks below. Confirm this is intended.
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?singulativeNumeralForm .
+    ?singulativeNumeralForm ontolex:representation ?singulativeNumeral ;
+      wikibase:grammaticalFeature wd:Q110786 .
+    FILTER(LANG(?singulativeNumeral) = "hi")
+  }
+
+  # MARK: Collective Numeral
+
+  # NOTE(review): same caveat as above, for forms carrying wd:Q146786.
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?collectiveNumeralForm .
+    ?collectiveNumeralForm ontolex:representation ?collectiveNumeral ;
+      wikibase:grammaticalFeature wd:Q146786 .
+    FILTER(LANG(?collectiveNumeral) = "hi")
+  }
+
+  # In the remaining blocks a form must carry all three listed features at
+  # once (per the variable names: gender, number and case).
+
+  # MARK: Direct
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?femSingularDirectForm .
+    ?femSingularDirectForm ontolex:representation ?femSingularDirect ;
+      wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q1751855 .
+    FILTER(LANG(?femSingularDirect) = "hi")
+  }
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?masSingularDirectForm .
+    ?masSingularDirectForm ontolex:representation ?masSingularDirect ;
+      wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q1751855 .
+    FILTER(LANG(?masSingularDirect) = "hi")
+  }
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?femPluralDirectForm .
+    ?femPluralDirectForm ontolex:representation ?femPluralDirect ;
+      wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q1751855 .
+    FILTER(LANG(?femPluralDirect) = "hi")
+  }
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?masPluralDirectForm .
+    ?masPluralDirectForm ontolex:representation ?masPluralDirect ;
+      wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q1751855 .
+    FILTER(LANG(?masPluralDirect) = "hi")
+  }
+
+  # MARK: Oblique
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?femSingularObliqueForm .
+    ?femSingularObliqueForm ontolex:representation ?femSingularOblique ;
+      wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q1233197 .
+    FILTER(LANG(?femSingularOblique) = "hi")
+  }
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?masSingularObliqueForm .
+    ?masSingularObliqueForm ontolex:representation ?masSingularOblique ;
+      wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q1233197 .
+    FILTER(LANG(?masSingularOblique) = "hi")
+  }
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?femPluralObliqueForm .
+    ?femPluralObliqueForm ontolex:representation ?femPluralOblique ;
+      wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q1233197 .
+    FILTER(LANG(?femPluralOblique) = "hi")
+  }
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?masPluralObliqueForm .
+    ?masPluralObliqueForm ontolex:representation ?masPluralOblique ;
+      wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q1233197 .
+    FILTER(LANG(?masPluralOblique) = "hi")
+  }
+
+  # MARK: Vocative
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?femSingularVocativeForm .
+    ?femSingularVocativeForm ontolex:representation ?femSingularVocative ;
+      wikibase:grammaticalFeature wd:Q1775415, wd:Q110786, wd:Q185077 .
+    FILTER(LANG(?femSingularVocative) = "hi")
+  }
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?masSingularVocativeForm .
+    ?masSingularVocativeForm ontolex:representation ?masSingularVocative ;
+      wikibase:grammaticalFeature wd:Q499327, wd:Q110786, wd:Q185077 .
+    FILTER(LANG(?masSingularVocative) = "hi")
+  }
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?femPluralVocativeForm .
+    ?femPluralVocativeForm ontolex:representation ?femPluralVocative ;
+      wikibase:grammaticalFeature wd:Q1775415, wd:Q146786, wd:Q185077 .
+    FILTER(LANG(?femPluralVocative) = "hi")
+  }
+
+  OPTIONAL {
+    ?lexeme ontolex:lexicalForm ?masPluralVocativeForm .
+    ?masPluralVocativeForm ontolex:representation ?masPluralVocative ;
+      wikibase:grammaticalFeature wd:Q499327, wd:Q146786, wd:Q185077 .
+    FILTER(LANG(?masPluralVocative) = "hi")
+  }
+}
diff --git a/...action/Hindi/adverbs/query_adverbs.sparql → ...ustani/Hindi/adverbs/query_adverbs.sparql b/...action/Hindi/adverbs/query_adverbs.sparql → ...ustani/Hindi/adverbs/query_adverbs.sparql
@@ -1,7 +1,8 @@
 # tool: scribe-data
 # All Hindi (from Hindustani Q11051) adverbs.
 # Enter this query at https://query.wikidata.org/.
-# Note the necessity to filter for "hi" to remove Urdu (ur) words.
+
+# Note: We need to filter for "hi" to remove Urdu (ur) words.
 
 SELECT DISTINCT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)

diff --git a/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/___init__.py b/src/scribe_data/language_data_extraction/Hindustani/Hindi/emoji_keywords/___init__.py
diff --git a/...emoji_keywords/generate_emoji_keywords.py → ...emoji_keywords/generate_emoji_keywords.py b/...emoji_keywords/generate_emoji_keywords.py → ...emoji_keywords/generate_emoji_keywords.py
@@ -25,8 +25,7 @@
 from scribe_data.unicode.process_unicode import gen_emoji_lexicon
 from scribe_data.utils import export_formatted_data
 
-LANGUAGE = "Hindustani" # Broad language category
-LANGUAGE_CODE = "hi" # Specific filter for Hindi
+LANGUAGE = "Hindi"
 DATA_TYPE = "emoji-keywords"
 emojis_per_keyword = 3
 
@@ -38,7 +37,6 @@
 if emoji_keywords_dict := gen_emoji_lexicon(
  language=LANGUAGE,
  emojis_per_keyword=emojis_per_keyword,
- filter_language_code=LANGUAGE_CODE, # Adding filter for Hindi language code "hi"
 ):
  export_formatted_data(
  file_path=args.file_path,

diff --git a/...extraction/Hindi/nouns/query_nouns.sparql → ...Hindustani/Hindi/nouns/query_nouns.sparql b/...extraction/Hindi/nouns/query_nouns.sparql → ...Hindustani/Hindi/nouns/query_nouns.sparql
@@ -1,7 +1,8 @@
 # tool: scribe-data
 # All Hindi (from Hindustani Q11051) nouns and their gender.
 # Enter this query at https://query.wikidata.org/.
-# Note the necessity to filter for "hi" to remove Urdu (ur) words.
+
+# Note: We need to filter for "hi" to remove Urdu (ur) words.
 
 SELECT DISTINCT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)

diff --git a/.../Postpositions/query_postpositions.sparql → .../postpositions/query_postpositions.sparql b/.../Postpositions/query_postpositions.sparql → .../postpositions/query_postpositions.sparql
@@ -1,7 +1,8 @@
 # tool: scribe-data
 # All Hindi (from Hindustani Q11051) postpositions.
 # Enter this query at https://query.wikidata.org/.
-# Note the necessity to filter for "hi" to remove Urdu (ur) words.
+
+# Note: We need to filter for "hi" to remove Urdu (ur) words.
 
 SELECT DISTINCT
  ?lexeme

diff --git a/...extraction/Hindi/verbs/query_verbs.sparql → ...Hindustani/Hindi/verbs/query_verbs.sparql b/...extraction/Hindi/verbs/query_verbs.sparql → ...Hindustani/Hindi/verbs/query_verbs.sparql
@@ -1,7 +1,8 @@
 # tool: scribe-data
 # All Hindi (from Hindustani Q11051) verbs and the currently implemented forms for each.
 # Enter this query at https://query.wikidata.org/.
-# Note the necessity to filter for "hi" to remove Urdu (ur) words.
+
+# Note: We need to filter for "hi" to remove Urdu (ur) words.
 
 SELECT DISTINCT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)

diff --git a/...n/Urdu/adjectives/query_adjectives.sparql → ...i/Urdu/adjectives/query_adjectives.sparql b/...n/Urdu/adjectives/query_adjectives.sparql → ...i/Urdu/adjectives/query_adjectives.sparql
@@ -1,10 +1,14 @@
 # tool: scribe-data
-# All Urduo (from Hindustani Q11051) adjectives.
+# All Urdu (from Hindustani Q11051) adjectives with the included grammatical forms.
 # Enter this query at https://query.wikidata.org/.
 
+# Note: We need to filter for "ur" to remove Hindi (hi) words.
+
 SELECT DISTINCT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
- ?lemma
+ ?adjective
+ ?singulativeNumeral
+ ?collectiveNumeral
  ?femSingularDirect
  ?masSingularDirect
  ?femPluralDirect
@@ -21,8 +25,26 @@ SELECT DISTINCT
 WHERE {
  ?lexeme dct:language wd:Q11051 ;
  wikibase:lexicalCategory wd:Q34698 ;
- wikibase:lemma ?lemma .
- FILTER(lang(?lemma) = "ur")
+ wikibase:lemma ?adjective .
+ FILTER(lang(?adjective) = "ur")
+
+ # MARK: Singulative Numeral
+
+ OPTIONAL {
+ ?lexeme ontolex:lexicalForm ?singulativeNumeralForm .
+ ?singulativeNumeralForm ontolex:representation ?singulativeNumeral ;
+ wikibase:grammaticalFeature wd:Q110786 .
+ FILTER(LANG(?singulativeNumeral) = "ur")
+ }
+
+ # MARK: Collective Numeral
+
+ OPTIONAL {
+ ?lexeme ontolex:lexicalForm ?collectiveNumeralForm .
+ ?collectiveNumeralForm ontolex:representation ?collectiveNumeral ;
+ wikibase:grammaticalFeature wd:Q146786 .
+ FILTER(LANG(?collectiveNumeral) = "ur")
+ }
 
  # MARK: Direct
 

diff --git a/...raction/Urdu/adverbs/query_adverbs.sparql → ...dustani/Urdu/adverbs/query_adverbs.sparql b/...raction/Urdu/adverbs/query_adverbs.sparql → ...dustani/Urdu/adverbs/query_adverbs.sparql
@@ -1,14 +1,15 @@
 # tool: scribe-data
 # All Urdu (from Hindustani Q11051) adverbs.
 # Enter this query at https://query.wikidata.org/.
-# Note the necessity to filter for "ur" to remove Hindi (hi) words.
+
+# Note: We need to filter for "ur" to remove Hindi (hi) words.
 
 SELECT DISTINCT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
  ?adverb
 
 WHERE {
- ?lexeme dct:language wd:Q11051 ; # Urdu language (from Hindustani)
+ ?lexeme dct:language wd:Q11051 ;
  wikibase:lexicalCategory wd:Q380057 ;
  wikibase:lemma ?adverb .
  FILTER(lang(?adverb) = "ur")

diff --git a/..._extraction/Urdu/nouns/query_nouns.sparql → .../Hindustani/Urdu/nouns/query_nouns.sparql b/..._extraction/Urdu/nouns/query_nouns.sparql → .../Hindustani/Urdu/nouns/query_nouns.sparql
@@ -1,7 +1,8 @@
 # tool: scribe-data
 # All Urdu (from Hindustani Q11051) nouns and their gender.
 # Enter this query at https://query.wikidata.org/.
-# Note the necessity to filter for "ur" to remove Hindi (hi) words.
+
+# Note: We need to filter for "ur" to remove Hindi (hi) words.
 
 SELECT DISTINCT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)

diff --git a/...be_data/language_data_extraction/Hindustani/Urdu/postpositions/query_postpositions.sparql b/...be_data/language_data_extraction/Hindustani/Urdu/postpositions/query_postpositions.sparql
@@ -0,0 +1,17 @@
+# tool: scribe-data
+# All Urdu (from Hindustani Q11051) postpositions.
+# Enter this query at https://query.wikidata.org/.
+
+# Note: We need to filter for "ur" to remove Hindi (hi) words.
+
+SELECT DISTINCT
+  ?lexeme
+  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
+  ?postposition
+
+WHERE {
+  # Lexemes filed under Hindustani (Q11051) in the postposition category.
+  ?lexeme dct:language wd:Q11051 .
+  ?lexeme wikibase:lexicalCategory wd:Q161873 .
+
+  # Keep only lemmas tagged "ur" so the Hindi ("hi") entries are excluded.
+  ?lexeme wikibase:lemma ?postposition .
+  FILTER(LANG(?postposition) = "ur")
+}
diff --git a/..._extraction/Urdu/verbs/query_verbs.sparql → .../Hindustani/Urdu/verbs/query_verbs.sparql b/..._extraction/Urdu/verbs/query_verbs.sparql → .../Hindustani/Urdu/verbs/query_verbs.sparql
@@ -1,7 +1,8 @@
 # tool: scribe-data
 # All Urdu (from Hindustani Q11051) verbs and the currently implemented conjugations for each.
 # Enter this query at https://query.wikidata.org/.
-# Note the necessity to filter for "ur" to remove Hindustani (hi) words.
+
+# Note: We need to filter for "ur" to remove Hindi (hi) words.
 
 SELECT DISTINCT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)

diff --git a/src/scribe_data/language_data_extraction/Punjabi/nouns/query_nouns.sparql b/src/scribe_data/language_data_extraction/Punjabi/nouns/query_nouns.sparql
@@ -1,7 +1,8 @@
 # tool: scribe-data
 # All Gurmukhi (from Punjabi Q58635) nouns, their plurals and their genders.
 # Enter this query at https://query.wikidata.org/.
-# Note the necessity to filter for "pa" to select Gurmukhi words.
+
+# Note: We need to filter for "pa" to select Gurmukhi words.
 
 SELECT DISTINCT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)