-
Notifications
You must be signed in to change notification settings - Fork 16
/
json2xls.py
48 lines (40 loc) · 1.68 KB
/
json2xls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import xlsxwriter
import tweet_cleaner
import json
import argparse
# Command-line interface: both the input and the output option are mandatory.
parser = argparse.ArgumentParser(
    description='extract tweet from json and write them into xls file',
)
# argparse itself opens the input as UTF-8 text while parsing the arguments.
parser.add_argument(
    '-i',
    '--json-file',
    required=True,
    help='input json file.',
    type=argparse.FileType(mode='r', encoding='utf-8'),
)
# The output is kept as a plain path string.
parser.add_argument(
    '-o',
    '--out-file',
    required=True,
    help='the output file.',
    type=str,
)
def process_json(json_file, xls_file):
    """Write one worksheet row per tweet read from a line-delimited JSON file.

    The first row holds the column headers ``id``, ``created_at``,
    ``full_text`` and ``clean_text``; each following row holds one tweet.
    For a retweet (a tweet carrying ``retweeted_status``) the text of the
    original tweet is used instead of the tweet's own ``full_text``.

    Args:
        json_file: Open text file (or any iterable of strings) containing
            one JSON object per line. Blank lines are skipped.
        xls_file: Output path handed to ``xlsxwriter.Workbook``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a tweet lacks ``id_str``, ``created_at`` or
            ``full_text``.
    """
    headers = ('id', 'created_at', 'full_text', 'clean_text')

    # The context manager closes the workbook even when a line fails to
    # parse, instead of leaving it open on the error path.
    with xlsxwriter.Workbook(xls_file) as workbook:
        worksheet = workbook.add_worksheet()

        # Rows and columns are zero indexed; row 0 is the header row.
        worksheet.write_row(0, 0, headers)
        row = 1

        # Iterate the file lazily rather than loading every line up front.
        for line in json_file:
            # A trailing or stray empty line would make json.loads() raise.
            if not line.strip():
                continue

            json_tweet = json.loads(line)
            if 'retweeted_status' in json_tweet:
                text = json_tweet['retweeted_status']['full_text']
            else:
                text = json_tweet['full_text']

            # Cleaning pipeline from the project's tweet_cleaner module;
            # the last step receives the whitespace-split tokens.
            clean_text = tweet_cleaner.clean_tweet(text)
            clean_text = tweet_cleaner.normalize_arabic(clean_text)
            clean_text = tweet_cleaner.remove_repeating_char(clean_text)
            clean_text = tweet_cleaner.keep_only_arabic(clean_text.split())

            worksheet.write(row, 0, json_tweet['id_str'])
            worksheet.write(row, 1, json_tweet['created_at'])
            # write() dispatches on content: text starting with '=' would be
            # stored as a formula and text starting with 'http://' as a
            # hyperlink. Raw tweet text must always be stored as a string.
            worksheet.write_string(row, 2, text)
            worksheet.write(row, 3, clean_text)
            row += 1
if __name__ == '__main__':
    args = parser.parse_args()
    # argparse.FileType opens the input while parsing and never closes it;
    # the with-block releases the handle once processing finishes or fails.
    with args.json_file as json_file:
        process_json(json_file, args.out_file)