-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdevelopment.py
86 lines (81 loc) · 3.05 KB
/
development.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""Function(s) that handle the development of data structures."""
import string
import os
import nltk
from nltk.corpus import stopwords
from itertools import combinations
import xml.etree.ElementTree as ET
def develop_search_lists(search):
    """Develop a list of lists containing strings to search for.

    The search string is lower-cased and split on whitespace, English stop
    words are dropped, punctuation is stripped from the remaining words, and
    every combination of the surviving words is generated.

    Args:
        search: The raw search string.

    Returns:
        A list of lists of strings. The first inner list holds the single
        combination that uses every word; each following list holds the
        combinations that use one word fewer, down to the individual words.
        Words inside a combination keep their original order and are joined
        with a single space. The result is empty when no word survives the
        filtering.
    """
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    # One translation table strips every punctuation character in one pass.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    # Build a new list instead of calling remove() on the list being
    # iterated: removing in place makes the loop skip the element that
    # follows each removed word, so consecutive stop words slipped through.
    word_list = []
    for word in search.lower().split():
        if word in stop_words:
            continue
        cleaned = word.translate(strip_punctuation)
        # A token made only of punctuation becomes empty; drop it so it
        # cannot produce blank or double-spaced search terms.
        if cleaned:
            word_list.append(cleaned)
    # Largest combinations first, then progressively shorter ones.
    return [
        [" ".join(combo) for combo in combinations(word_list, size)]
        for size in range(len(word_list), 0, -1)
    ]
def _element_text(root, path):
    """Return the text of the first element matching path, or None."""
    element = root.find(path)
    return None if element is None else element.text


def _article_title(root):
    """Return "title: subtitle", the title alone, or "None" without a title."""
    title = _element_text(root, ".//article-title")
    subtitle = _element_text(root, ".//subtitle")
    if title is None:
        return "None"
    if subtitle is None:
        return title
    return title + ": " + subtitle


def _article_date(root):
    """Return the available month/day/year texts joined by "/"."""
    parts = []
    for path in (".//month", ".//day", ".//year"):
        text = _element_text(root, path)
        if text is not None:
            parts.append(text)
    # Same "None" placeholder the title uses when nothing is available.
    return "/".join(parts) if parts else "None"


def _article_authors(root):
    """Return one "surname, given names" entry per surname element."""
    surnames = [element.text for element in root.findall(".//surname")]
    given_names = [element.text for element in root.findall(".//given-names")]
    authors = []
    for index, surname in enumerate(surnames):
        # Given names are paired with surnames by position; surplus given
        # names have no surname to attach to and are ignored.
        given = given_names[index] if index < len(given_names) else None
        if given is None:
            authors.append(surname)
        elif surname is None:
            authors.append(given)
        else:
            authors.append(surname + ", " + given)
    return authors


def _article_paragraphs(root):
    """Return the lower-cased leading text of every non-empty <p> element."""
    return [
        paragraph.text.lower()
        for paragraph in root.findall(".//p")
        if paragraph.text is not None
    ]


def develop_corpus(directory):
    """Develop a list of dictionaries containing the articles' information.

    Every regular file directly inside directory is parsed as XML, in sorted
    filename order so the result does not depend on the filesystem's listing
    order. Sub-directories are skipped.

    Args:
        directory: Path of the directory holding the article XML files.

    Returns:
        A list with one dictionary per file, with these keys:
            "Title": "title: subtitle", the title alone, or the string
                "None" when the article has no title text.
            "Date": the month, day and year texts that are present, joined
                by "/", or the string "None" when none is present.
            "Author(s)": a list of "surname, given names" strings.
            "Content": a list of the lower-cased paragraph texts.

    Raises:
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    corpus = []
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        # ET.parse cannot open a directory, so only regular files are read.
        if not os.path.isfile(path):
            continue
        root = ET.parse(path).getroot()
        corpus.append({
            "Title": _article_title(root),
            "Date": _article_date(root),
            "Author(s)": _article_authors(root),
            "Content": _article_paragraphs(root),
        })
    return corpus