-
Notifications
You must be signed in to change notification settings - Fork 0
/
Standards_Search.py
77 lines (63 loc) · 2.86 KB
/
Standards_Search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Example BSI shop search URL that lists all matching standards on a single page:
# https://shop.bsigroup.com/SearchResults/?q=bs+5266&pg=1&no=100&c=100&t=p
import requests
from bs4 import BeautifulSoup
import re
import os
import pdfplumber
class Standards_Search_Tools:
    """Find standard references in a PDF and look them up on the BSI shop.

    An instance is bound to one file path. That path is opened by
    ``extract_text_from_pdf`` and is used as the base name of the text
    report produced by ``write_to_file``.
    """

    # A known prefix, an optional single space, then the digits of the
    # number.  Anything after the digits (for example "-1:2016") is not part
    # of the match.
    _STANDARD_REFERENCE_PATTERN = re.compile(
        r"(BS|BS EN|EN|ISO|IEC|BSRIABG|DW) ?\d+"
    )

    def __init__(self, filePath):
        """Remember the path of the document this instance works on.

        Args:
            filePath: Path of the PDF to read; also the stem of the report
                file name.
        """
        self.file_path = filePath

    def return_list_of_standards(self, standard_name, timeout=30):
        """Query the BSI shop search page and return the listed results.

        Args:
            standard_name: Search term. It is interpolated into the query
                string as given.
            timeout: Value passed to ``requests.get`` as ``timeout`` so an
                unresponsive server cannot block the caller indefinitely.

        Returns:
            A list with one entry per search result. Each entry is the list
            of non-empty text lines found in that result's block. The list
            is empty when the page has no ``MainFrame`` container or no
            result blocks.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        URL = f"https://shop.bsigroup.com/SearchResults/?q={standard_name}&pg=1&no=100&c=20&t=p"
        response = requests.get(URL, timeout=timeout)
        # Fail loudly on an error response instead of scraping an error page.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        container = soup.find(id='MainFrame')
        if container is None:
            # Without the results container there is nothing to list.
            return []

        return [
            self._clean_result_text(result_block.text.strip())
            for result_block in container.find_all('div', class_='resultsInd')
        ]

    @staticmethod
    def _clean_result_text(raw_text):
        """Split the text of one search result into its non-empty lines."""
        # NOTE(review): as written this removes every ordinary space, so only
        # non-breaking spaces (converted two steps below) survive as word
        # separators. Confirm a single space, not a run of spaces, is meant.
        cleaned = raw_text.replace(" ", "")
        cleaned = cleaned.replace("\n\n", "\n")
        cleaned = cleaned.replace("\r\n", "")
        cleaned = cleaned.replace("\xa0", " ")
        cleaned = cleaned.strip('\r')
        return [line for line in cleaned.splitlines() if line]

    def text_search_for_standards(self, input_text):
        """Return the distinct standard references mentioned in a text.

        Args:
            input_text: Text to scan.

        Returns:
            A list of the matched references without duplicates, in order of
            first appearance, so the result is the same on every run.
        """
        found = (
            match.group()
            for match in self._STANDARD_REFERENCE_PATTERN.finditer(input_text)
        )
        # dict.fromkeys removes duplicates while keeping insertion order.
        return list(dict.fromkeys(found))

    def extract_text_from_pdf(self):
        """Return the text of every page of the PDF at ``self.file_path``.

        Returns:
            The page texts joined with a newline. The separator keeps a
            reference at the end of one page from fusing with digits at the
            start of the next.
        """
        with pdfplumber.open(self.file_path) as pdf:
            # extract_text() may return None for a page without text.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(page_texts)

    def write_to_file(self, document_to_write):
        """Write a report to ``<file_path>.txt`` encoded as UTF-8.

        The file is created or overwritten. It contains the heading
        "Summary of Standards:", a newline, and then the given text.

        Args:
            document_to_write: Text placed below the heading.
        """
        filename_to_write = f"{self.file_path}.txt"
        with open(filename_to_write, 'w', encoding="utf-8") as standards_review_doc:
            standards_review_doc.write(
                "Summary of Standards:" + "\n" + document_to_write
            )