Skip to content

Commit

Permalink
#212 #238 Hindi/Urdu under Hindustani and combining queries
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewtavis committed Oct 10, 2024
1 parent 213769a commit ca4698b
Show file tree
Hide file tree
Showing 15 changed files with 223 additions and 73 deletions.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# tool: scribe-data
# All Hindi (from Hindustani Q11051) adjectives with the included grammatical forms.
# Enter this query at https://query.wikidata.org/.

# Note: We need to filter for "hi" to remove Urdu (ur) words.

# Result shape: one column per grammatical form of each Hindi adjective lexeme.
# Every form is fetched in its own OPTIONAL group, so a lexeme that lacks a form
# still appears (with that column unbound), and a lexeme with several matching
# forms for one column yields several rows.
#
# Grammatical feature QIDs used below, as implied by the variables they populate:
#   Q1775415 feminine    Q499327 masculine
#   Q110786  singular    Q146786 plural
#   Q1751855 direct      Q1233197 oblique    Q185077 vocative
SELECT DISTINCT
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
  ?adjective
  ?singulativeNumeral
  ?collectiveNumeral
  ?femSingularDirect
  ?masSingularDirect
  ?femPluralDirect
  ?masPluralDirect
  ?femSingularOblique
  ?masSingularOblique
  ?femPluralOblique
  ?masPluralOblique
  ?femSingularVocative
  ?masSingularVocative
  ?femPluralVocative
  ?masPluralVocative

WHERE {
  # Adjective (Q34698) lexemes filed under Hindustani (Q11051); the language tag
  # on the lemma is what separates Hindi ("hi") from Urdu ("ur").
  ?lexeme dct:language wd:Q11051 ;
    wikibase:lexicalCategory wd:Q34698 ;
    wikibase:lemma ?adjective .
  FILTER(LANG(?adjective) = "hi")

  # MARK: Singulative Numeral

  # NOTE(review): this group only requires Q110786, so it also matches the
  # gendered/case-marked singular forms selected further down — confirm that
  # this overlap is intended.
  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?singulativeNumeralForm .
    ?singulativeNumeralForm ontolex:representation ?singulativeNumeral ;
      wikibase:grammaticalFeature wd:Q110786 .
    FILTER(LANG(?singulativeNumeral) = "hi")
  }

  # MARK: Collective Numeral

  # NOTE(review): same overlap as above, here for Q146786 — confirm.
  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?collectiveNumeralForm .
    ?collectiveNumeralForm ontolex:representation ?collectiveNumeral ;
      wikibase:grammaticalFeature wd:Q146786 .
    FILTER(LANG(?collectiveNumeral) = "hi")
  }

  # MARK: Direct

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?femSingularDirectForm .
    ?femSingularDirectForm ontolex:representation ?femSingularDirect ;
      wikibase:grammaticalFeature wd:Q1775415 ;
      wikibase:grammaticalFeature wd:Q110786 ;
      wikibase:grammaticalFeature wd:Q1751855 .
    FILTER(LANG(?femSingularDirect) = "hi")
  }

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?masSingularDirectForm .
    ?masSingularDirectForm ontolex:representation ?masSingularDirect ;
      wikibase:grammaticalFeature wd:Q499327 ;
      wikibase:grammaticalFeature wd:Q110786 ;
      wikibase:grammaticalFeature wd:Q1751855 .
    FILTER(LANG(?masSingularDirect) = "hi")
  }

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?femPluralDirectForm .
    ?femPluralDirectForm ontolex:representation ?femPluralDirect ;
      wikibase:grammaticalFeature wd:Q1775415 ;
      wikibase:grammaticalFeature wd:Q146786 ;
      wikibase:grammaticalFeature wd:Q1751855 .
    FILTER(LANG(?femPluralDirect) = "hi")
  }

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?masPluralDirectForm .
    ?masPluralDirectForm ontolex:representation ?masPluralDirect ;
      wikibase:grammaticalFeature wd:Q499327 ;
      wikibase:grammaticalFeature wd:Q146786 ;
      wikibase:grammaticalFeature wd:Q1751855 .
    FILTER(LANG(?masPluralDirect) = "hi")
  }

  # MARK: Oblique

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?femSingularObliqueForm .
    ?femSingularObliqueForm ontolex:representation ?femSingularOblique ;
      wikibase:grammaticalFeature wd:Q1775415 ;
      wikibase:grammaticalFeature wd:Q110786 ;
      wikibase:grammaticalFeature wd:Q1233197 .
    FILTER(LANG(?femSingularOblique) = "hi")
  }

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?masSingularObliqueForm .
    ?masSingularObliqueForm ontolex:representation ?masSingularOblique ;
      wikibase:grammaticalFeature wd:Q499327 ;
      wikibase:grammaticalFeature wd:Q110786 ;
      wikibase:grammaticalFeature wd:Q1233197 .
    FILTER(LANG(?masSingularOblique) = "hi")
  }

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?femPluralObliqueForm .
    ?femPluralObliqueForm ontolex:representation ?femPluralOblique ;
      wikibase:grammaticalFeature wd:Q1775415 ;
      wikibase:grammaticalFeature wd:Q146786 ;
      wikibase:grammaticalFeature wd:Q1233197 .
    FILTER(LANG(?femPluralOblique) = "hi")
  }

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?masPluralObliqueForm .
    ?masPluralObliqueForm ontolex:representation ?masPluralOblique ;
      wikibase:grammaticalFeature wd:Q499327 ;
      wikibase:grammaticalFeature wd:Q146786 ;
      wikibase:grammaticalFeature wd:Q1233197 .
    FILTER(LANG(?masPluralOblique) = "hi")
  }

  # MARK: Vocative

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?femSingularVocativeForm .
    ?femSingularVocativeForm ontolex:representation ?femSingularVocative ;
      wikibase:grammaticalFeature wd:Q1775415 ;
      wikibase:grammaticalFeature wd:Q110786 ;
      wikibase:grammaticalFeature wd:Q185077 .
    FILTER(LANG(?femSingularVocative) = "hi")
  }

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?masSingularVocativeForm .
    ?masSingularVocativeForm ontolex:representation ?masSingularVocative ;
      wikibase:grammaticalFeature wd:Q499327 ;
      wikibase:grammaticalFeature wd:Q110786 ;
      wikibase:grammaticalFeature wd:Q185077 .
    FILTER(LANG(?masSingularVocative) = "hi")
  }

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?femPluralVocativeForm .
    ?femPluralVocativeForm ontolex:representation ?femPluralVocative ;
      wikibase:grammaticalFeature wd:Q1775415 ;
      wikibase:grammaticalFeature wd:Q146786 ;
      wikibase:grammaticalFeature wd:Q185077 .
    FILTER(LANG(?femPluralVocative) = "hi")
  }

  OPTIONAL {
    ?lexeme ontolex:lexicalForm ?masPluralVocativeForm .
    ?masPluralVocativeForm ontolex:representation ?masPluralVocative ;
      wikibase:grammaticalFeature wd:Q499327 ;
      wikibase:grammaticalFeature wd:Q146786 ;
      wikibase:grammaticalFeature wd:Q185077 .
    FILTER(LANG(?masPluralVocative) = "hi")
  }
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# tool: scribe-data
# All Hindi (from Hindustani Q11051) adverbs.
# Enter this query at https://query.wikidata.org/.
# Note the necessity to filter for "hi" to remove Urdu (ur) words.

# Note: We need to filter for "hi" to remove Urdu (ur) words.

SELECT DISTINCT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
Expand Down
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@
from scribe_data.unicode.process_unicode import gen_emoji_lexicon
from scribe_data.utils import export_formatted_data

LANGUAGE = "Hindustani" # Broad language category
LANGUAGE_CODE = "hi" # Specific filter for Hindi
LANGUAGE = "Hindi"
DATA_TYPE = "emoji-keywords"
emojis_per_keyword = 3

Expand All @@ -38,7 +37,6 @@
if emoji_keywords_dict := gen_emoji_lexicon(
language=LANGUAGE,
emojis_per_keyword=emojis_per_keyword,
filter_language_code=LANGUAGE_CODE, # Adding filter for Hindi language code "hi"
):
export_formatted_data(
file_path=args.file_path,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# tool: scribe-data
# All Hindi (from Hindustani Q11051) nouns and their gender.
# Enter this query at https://query.wikidata.org/.
# Note the necessity to filter for "hi" to remove Urdu (ur) words.

# Note: We need to filter for "hi" to remove Urdu (ur) words.

SELECT DISTINCT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# tool: scribe-data
# All Hindi (from Hindustani Q11051) postpositions.
# Enter this query at https://query.wikidata.org/.
# Note the necessity to filter for "hi" to remove Urdu (ur) words.

# Note: We need to filter for "hi" to remove Urdu (ur) words.

SELECT DISTINCT
?lexeme
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# tool: scribe-data
# All Hindi (from Hindustani Q11051) verbs and the currently implemented forms for each.
# Enter this query at https://query.wikidata.org/.
# Note the necessity to filter for "hi" to remove Urdu (ur) words.

# Note: We need to filter for "hi" to remove Urdu (ur) words.

SELECT DISTINCT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
# tool: scribe-data
# All Urduo (from Hindustani Q11051) adjectives.
# All Urdu (from Hindustani Q11051) adjectives with the included grammatical forms.
# Enter this query at https://query.wikidata.org/.

# Note: We need to filter for "ur" to remove Hindi (hi) words.

SELECT DISTINCT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?lemma
?adjective
?singulativeNumeral
?collectiveNumeral
?femSingularDirect
?masSingularDirect
?femPluralDirect
Expand All @@ -21,8 +25,26 @@ SELECT DISTINCT
WHERE {
?lexeme dct:language wd:Q11051 ;
wikibase:lexicalCategory wd:Q34698 ;
wikibase:lemma ?lemma .
FILTER(lang(?lemma) = "ur")
wikibase:lemma ?adjective .
FILTER(lang(?adjective) = "ur")

# MARK: Singulative Numeral

OPTIONAL {
?lexeme ontolex:lexicalForm ?singulativeNumeralForm .
?singulativeNumeralForm ontolex:representation ?singulativeNumeral ;
wikibase:grammaticalFeature wd:Q110786 .
FILTER(LANG(?singulativeNumeral) = "ur")
}

# MARK: Collective Numeral

OPTIONAL {
?lexeme ontolex:lexicalForm ?collectiveNumeralForm .
?collectiveNumeralForm ontolex:representation ?collectiveNumeral ;
wikibase:grammaticalFeature wd:Q146786 .
FILTER(LANG(?collectiveNumeral) = "ur")
}

# MARK: Direct

Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
# tool: scribe-data
# All Urdu (from Hindustani Q11051) adverbs.
# Enter this query at https://query.wikidata.org/.
# Note the necessity to filter for "ur" to remove Hindi (hi) words.

# Note: We need to filter for "ur" to remove Hindi (hi) words.

SELECT DISTINCT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
?adverb

WHERE {
?lexeme dct:language wd:Q11051 ; # Urdu language (from Hindustani)
?lexeme dct:language wd:Q11051 ;
wikibase:lexicalCategory wd:Q380057 ;
wikibase:lemma ?adverb .
FILTER(lang(?adverb) = "ur")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# tool: scribe-data
# All Urdu (from Hindustani Q11051) nouns and their gender.
# Enter this query at https://query.wikidata.org/.
# Note the necessity to filter for "ur" to remove Hindi (hi) words.

# Note: We need to filter for "ur" to remove Hindi (hi) words.

SELECT DISTINCT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# tool: scribe-data
# All Urdu (from Hindustani Q11051) postpositions.
# Enter this query at https://query.wikidata.org/.

# Note: We need to filter for "ur" to remove Hindi (hi) words.

# Output columns: the lexeme IRI, the same IRI with the entity prefix stripped,
# and the lemma text.
SELECT DISTINCT
  ?lexeme
  (REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
  ?postposition

WHERE {
  # Postposition (Q161873) lexemes filed under Hindustani (Q11051).
  ?lexeme
    dct:language wd:Q11051 ;
    wikibase:lexicalCategory wd:Q161873 ;
    wikibase:lemma ?postposition .

  # Keep only lemmas carrying the "ur" language tag.
  FILTER(LANG(?postposition) = "ur")
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# tool: scribe-data
# All Urdu (from Hindustani Q11051) verbs and the currently implemented conjugations for each.
# Enter this query at https://query.wikidata.org/.
# Note the necessity to filter for "ur" to remove Hindustani (hi) words.

# Note: We need to filter for "ur" to remove Hindi (hi) words.

SELECT DISTINCT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# tool: scribe-data
# All Gurmukhi (from Punjabi Q58635) nouns, their plurals and their genders.
# Enter this query at https://query.wikidata.org/.
# Note the necessity to filter for "pa" to select Gurmukhi words.

# Note: We need to filter for "pa" to select Gurmukhi words.

SELECT DISTINCT
(REPLACE(STR(?lexeme), "http://www.wikidata.org/entity/", "") AS ?lexemeID)
Expand Down

0 comments on commit ca4698b

Please sign in to comment.