-
Notifications
You must be signed in to change notification settings - Fork 0
/
process-pageviews.py
executable file
·53 lines (45 loc) · 1.57 KB
/
process-pageviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/bin/python
import sys
import glob
# Glob pattern for the input files and path of the generated title list.
PAGEVIEWS_GLOB = 'pageviews/*'
OUTPUT_PATH = 'titles.txt'

# Only rows whose first field equals this value are counted.
# NOTE(review): presumably the mobile English Wikipedia project code in a
# Wikimedia pageviews dump -- confirm against the data source.
PROJECT_CODE = 'en.m'

# Minimum total view count used when no valid threshold is given.
DEFAULT_THRESHOLD = 50

# A page is dropped when its name contains any of these substrings
# (the match is a substring test, not a prefix test).
EXCLUDED_SUBSTRINGS = ['File:', 'Wikipedia:', 'Help:', 'Category:', 'Portal:', 'Special:', 'Main_Page']


def parse_threshold(argv: list) -> int:
    """Return the view-count threshold taken from ``argv[1]``.

    Falls back to ``DEFAULT_THRESHOLD`` when the argument is missing, and
    prints a warning before falling back when it is not an integer.
    """
    if len(argv) <= 1:
        return DEFAULT_THRESHOLD
    try:
        return int(argv[1])
    except ValueError:
        print("Invalid threshold value. Using default value of 50.")
        return DEFAULT_THRESHOLD


def accumulate_views(lines, view_counts: dict) -> dict:
    """Add the view counts found in *lines* to *view_counts* and return it.

    Each line is split on whitespace; field 0 is compared with
    ``PROJECT_CODE``, field 1 is the page name and field 2 the view count.
    Lines with fewer than three fields, a non-integer count, a count below
    one, another project code, or an excluded page name are skipped.
    """
    for line in lines:
        parts = line.split()
        # Blank or truncated lines would otherwise raise IndexError.
        if len(parts) < 3 or parts[0] != PROJECT_CODE:
            continue
        try:
            view_count = int(parts[2])
        except ValueError:
            continue
        if view_count < 1:
            continue
        page_name = parts[1]
        if any(marker in page_name for marker in EXCLUDED_SUBSTRINGS):
            continue
        view_counts[page_name] = view_counts.get(page_name, 0) + view_count
    return view_counts


def select_titles(view_counts: dict, threshold: int) -> list:
    """Return display titles with at least *threshold* views, most viewed first.

    Underscores in page names are replaced by spaces. Pages with equal
    counts keep the order in which they were first seen.
    """
    # Filter before sorting so only the surviving entries are sorted; the
    # sort is stable (also with reverse=True), so the result order is the
    # same as sorting everything and filtering afterwards.
    popular = [item for item in view_counts.items() if item[1] >= threshold]
    popular.sort(key=lambda item: item[1], reverse=True)
    # No strip() needed: names come from str.split() and carry no whitespace.
    return [name.replace("_", " ") for name, _ in popular]


def main(argv=None) -> None:
    """Aggregate views from ``pageviews/*`` and write titles to ``titles.txt``.

    *argv* defaults to ``sys.argv``; its optional second element is the
    minimum total view count a page needs to be written out.
    """
    threshold = parse_threshold(sys.argv if argv is None else argv)

    view_counts = {}
    for filename in glob.glob(PAGEVIEWS_GLOB):
        print(f'Processing file: {filename}')
        # Stream the file line by line instead of loading it whole.
        with open(filename, 'r', encoding='utf-8') as f:
            accumulate_views(f, view_counts)

    print('Sorting results...')
    titles = select_titles(view_counts, threshold)

    # Open the output only once the data is ready, so a failure while
    # reading the inputs does not truncate an existing titles file.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_file:
        out_file.writelines(f'{title}\n' for title in titles)
    print('Done!')


if __name__ == '__main__':
    main()