-
Notifications
You must be signed in to change notification settings - Fork 0
/
flashcards.py
52 lines (46 loc) · 2.07 KB
/
flashcards.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from collections import Counter
import importlib
import csv
# The package name contains a hyphen, so it cannot be imported with a plain
# `import` statement; importlib loads it by its string name instead.
heb_vocab = importlib.import_module("hebrew-vocab-tools")
TokenType, ChunkType, get_tokens_by_chunk, get_tokens = heb_vocab.TokenType, heb_vocab.ChunkType, heb_vocab.get_tokens_by_chunk, heb_vocab.get_tokens
HEBLEX = heb_vocab.heb_lex_tools.HEBLEX
# https://github.com/openscriptures/HebrewLexicon/blob/master/HebrewLexicon.pdf
# Strong's-number -> gloss lookup backed by the Open Scriptures Hebrew Lexicon.
GLOSSER = HEBLEX()
# Frequency of each lemma across the whole corpus returned by get_tokens.
HEB_LEMMAS_COUNT = Counter(get_tokens(TokenType.lemma))
# Inverted (morph, lemma) pairs -> lemma-keyed morph-code lookup.
# NOTE(review): assumes get_tokens(TokenType.morph_lemma) yields
# (morph_code, lemma/strongs) pairs — confirm against hebrew-vocab-tools.
HEB_LEMMAS_MORPH = dict((y, x)
                        for x, y in get_tokens(TokenType.morph_lemma))
def get_gloss(strongs):
    """Return "gloss (morph_code)" for a Strong's number.

    Returns "Proper Noun" for personal names, and the bare morph code
    when the lexicon has no gloss for the entry.

    Raises:
        KeyError: if *strongs* is not present in HEB_LEMMAS_MORPH.
    """
    morph_code = HEB_LEMMAS_MORPH[strongs]
    # Personal names ('Np' in the morph code) get a generic label
    # rather than a gloss.  (Membership test replaces `.find() != -1`.)
    if 'Np' in morph_code:
        return "Proper Noun"
    gloss = GLOSSER.strongs_to_gloss(strongs)
    # If the lexicon yields no gloss, fall back to the morph code alone.
    if not gloss:
        return morph_code
    return f"{gloss} ({morph_code})"
def create_csv(book_id, less_than_count=100, chap_start=None, chap_end=None):
    """
    Create a CSV of lemmas used at most *less_than_count* times corpus-wide.

    Args:
        book_id: Book identifier as used by get_tokens_by_chunk (e.g. "Gen").
        less_than_count: Inclusive upper bound on a lemma's total corpus count.
        chap_start, chap_end: Optional inclusive chapter range; when omitted,
            the whole book is used.

    Returns:
        The (closed) file object, so callers can read e.g. ``f.name``.
    """
    chunks = []
    if chap_start and chap_end:
        # Hoist the chunk lookup out of the loop — it does not depend on chap.
        by_chapter = get_tokens_by_chunk(TokenType.lemma, ChunkType.chapter)
        for chap in range(chap_start, chap_end + 1):
            chunks.extend(by_chapter[f'{book_id}.{chap}'])
    else:
        chunks.extend(get_tokens_by_chunk(TokenType.lemma, ChunkType.book)[book_id])
    book_counts = Counter(chunks)
    range_tag = f"_{chap_start}-{chap_end}" if chap_start else ''
    path = f"/tmp/{book_id}{range_tag}_{less_than_count}_cards.csv"
    # `with` guarantees the file is closed even if a lookup below raises;
    # newline='' is required by the csv module to avoid extra blank rows.
    with open(path, 'w', encoding="UTF-8", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["Lemma", "Gloss"])
        writer.writerow(["Example (Occurrences)", "Gloss (Parsing)"])
        for strongs, count in book_counts.most_common():
            total_count = HEB_LEMMAS_COUNT[strongs]
            if total_count <= less_than_count:
                lemma = f"{GLOSSER.strongs_to_lemma(strongs)} ({total_count}x)"
                writer.writerow([lemma, get_gloss(strongs)])
    return f