-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_transcriptions.py
114 lines (88 loc) · 4.09 KB
/
get_transcriptions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import atexit
import html
import json
import re
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm
# selenium setup: run Chrome headless so no browser window is opened
options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
# Nothing else in this script shuts the browser down, so without this hook
# every run would leave chrome/chromedriver processes behind. atexit runs the
# callback on normal interpreter exit and after an unhandled exception.
atexit.register(driver.quit)
# Folder holding the link lists: one text file per quadro, one link per line.
links_folder = Path('./links')

# Map each quadro (the file name without its extension) to its list of links.
all_links = dict()
for link_file in links_folder.iterdir():
    # Skip sub-directories and other entries that cannot be read as text.
    if not link_file.is_file():
        continue
    # get quadro from file name
    quadro = link_file.stem
    # Read with an explicit encoding so the result does not depend on the
    # platform default.
    links_plain = link_file.read_text(encoding='utf-8')
    # splitlines() keeps the final link even when the file does not end with
    # a newline (split('\n')[:-1] silently dropped it) and handles CRLF line
    # endings; blank lines are skipped so they are never passed to the browser.
    all_links[quadro] = [
        line.strip() for line in links_plain.splitlines() if line.strip()
    ]
# extract/crawl transcriptions

# Patterns are compiled once here instead of on every visited page.
_BR_TAG = re.compile(r'<br\s*/?>', re.IGNORECASE)  # <br>, <br/>, <br />
_ANY_TAG = re.compile(r'<.*?>', re.DOTALL)  # DOTALL: tags may span several lines

# Header lines recognised on a page: each entry maps the prefix a header line
# starts with to the field name used in the output record.
header_metadata = [
    {
        'field': 'title',
        'prefix': 'título: ',
    },
    {
        'field': 'publishing_date',
        'prefix': 'data de publicação: ',
    },
    {
        'field': 'quadro',
        'prefix': 'quadro: ',
    },
    {
        'field': 'hashtag',
        'prefix': 'hashtag: ',
    },
    {
        'field': 'characters',
        'prefix': 'personagens: ',
    },
]


def _html_to_text(html_content):
    """Convert the inner HTML of the content element into plain text.

    Line-break tags become newline characters, every other tag is removed,
    HTML entities are decoded and non-breaking spaces become normal spaces.
    """
    text = _BR_TAG.sub('\n', html_content)
    # Strip tags BEFORE decoding entities: decoding first would turn escaped
    # text such as "&lt;x&gt;" into "<x>", which would then be deleted as if
    # it were markup.
    text = _ANY_TAG.sub('', text)
    text = html.unescape(text)
    # Normalising after unescape covers both the "&nbsp;" entity (decoded to
    # U+00A0 above) and literal U+00A0 characters.
    return text.replace('\xa0', ' ')


def _split_transcription(full_transcription):
    """Split a page's text into its header lines and the transcription body.

    The text is cut at the first 'TRANSCRIÇÃO' marker (the marker itself is
    dropped) or, failing that, at the first '[vinheta]' marker (which is kept
    at the start of the body).

    Returns:
        A ``(header_lines, content)`` tuple: the stripped header split into
        lines, and the stripped body.

    Raises:
        ValueError: if neither marker occurs in the text.
    """
    if 'TRANSCRIÇÃO' in full_transcription:
        raw_header, raw_content = full_transcription.split('TRANSCRIÇÃO', maxsplit=1)
    elif '[vinheta]' in full_transcription:
        raw_header, raw_content = full_transcription.split('[vinheta]', maxsplit=1)
        raw_content = f"[vinheta]{raw_content}"
    else:
        raise ValueError('Could not find split key for transcription')
    return raw_header.strip().split('\n'), raw_content.strip()


# One list of transcription records per quadro, in the order the links are visited.
all_transcriptions = {}
for quadro, links in all_links.items():
    all_transcriptions[quadro] = []
    for link in tqdm(links):
        try:
            driver.get(link)
            # extract transcription from html
            texts_div = driver.find_element(By.CLASS_NAME, 'content-inner')
            full_transcription = _html_to_text(texts_div.get_attribute("innerHTML"))
            # split transcription into header and content
            transcription_header, transcription_content = _split_transcription(full_transcription)
            # get transcription attributes: a header line contributes a field
            # only when it starts with one of the known prefixes
            transcription = dict()
            for header in transcription_header:
                for metadata in header_metadata:
                    if header.startswith(metadata['prefix']):
                        transcription[metadata['field']] = header.removeprefix(metadata['prefix'])
            transcription['transcription'] = transcription_content
            # add episode transcription to all transcriptions
            all_transcriptions[quadro].append(transcription)
        except Exception as e:
            # Deliberately best-effort: a page that fails to load or parse is
            # reported and skipped so the remaining links are still crawled.
            print(f'Error extracting transcription from: {link}. Error: {e}')
# Persist the results: one JSON file per quadro under ./transcriptions.
for quadro, transcriptions in all_transcriptions.items():
    output_path = Path(f"./transcriptions/{quadro}.json")
    # Create the output folder (and any missing parents) if it is not there yet.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding='utf-8') as out:
        # ensure_ascii=False writes accented characters as-is instead of
        # \uXXXX escapes.
        out.write(json.dumps(transcriptions, indent=4, ensure_ascii=False))