-
Notifications
You must be signed in to change notification settings - Fork 0
/
EUScrapping.py
170 lines (125 loc) · 6.91 KB
/
EUScrapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import PatentData
import pymongo
from PatentData import *
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import Configuration_Data
import MongoDBStore
from pymongo import collection
import time
import json
import os
import logging
#Contains all the scraping helper functions; they are called from the main block when required
class Scrapper:
    """Namespace for the scraping helpers that the main block calls.

    None of the helpers use instance state and all of them are invoked
    through the class itself (e.g. ``Scrapper.navigate(driver)``), so they
    are declared as static methods.
    """

    @staticmethod
    def _read_text(driver, xpath):
        """Return the visible text of the element located by *xpath*."""
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # newer Selenium releases no longer provide.
        return driver.find_element(By.XPATH, xpath).text

    #Function to fetch data for the given application number
    @staticmethod
    def GetBiblioData(driver):
        """Scrape the bibliographic data from the page loaded in *driver*.

        Args:
            driver: Selenium WebDriver showing the page to read. The XPaths
                of the individual fields come from ``Configuration_Data``.

        Returns:
            A ``PatentCaseMetaData`` object with ``filing_date``,
            ``patent_status``, ``patent_publication_identification``,
            ``parties`` and ``technical_data`` filled in.

        Raises:
            Exception: any failure is printed, logged with its traceback,
                the driver is quit, and the original exception is re-raised
                so the caller never continues with a ``None`` result.
        """
        try:
            logging.info("Starting Data scrapping")
            #Mapping the fetched Application Number to Application Number data in ContinuityDatum class in PatentData
            # NOTE(review): this object is populated but never attached to the
            # returned metadata, so the application number does not reach the
            # JSON output - confirm where it belongs in the PatentData model.
            application_number_obj = ContinuityDatum()
            application_number_obj.application_number = Scrapper._read_text(driver, Configuration_Data.app_no_obj)
            #Mapping the fetched Filing Date and Patent Status to Filing Date and Patent Status data in PatentCaseMetaData class in PatentData
            biblio = PatentCaseMetaData()
            biblio.filing_date = Scrapper._read_text(driver, Configuration_Data.filing_date_obj)
            biblio.patent_status = Scrapper._read_text(driver, Configuration_Data.status)
            #Mapping the fetched Publication Date and Number to Publication Date and Number data in PatentPublicationIdentification class in PatentData
            publication = PatentPublicationIdentification()
            publication.publication_date = Scrapper._read_text(driver, Configuration_Data.publication_date_obj)
            publication.publication_number = Scrapper._read_text(driver, Configuration_Data.publication_no_obj)
            biblio.patent_publication_identification = publication
            #Mapping the fetched Inventor(s) to Inventor data in Parties class in PatentData
            parties = Parties()
            inventor = Applicant()
            inventor.full_name = Scrapper._read_text(driver, Configuration_Data.inventor_obj)
            # The inventors field is a list, so the single scraped entry is
            # wrapped in one.
            parties.inventors = [inventor]
            biblio.parties = parties
            #Mapping the fetched CPC_Classification to Classifications_CPC data in TechnicalData class in PatentData
            technical_data = TechnicalData()
            technical_data.classifications_cpc = Scrapper._read_text(driver, Configuration_Data.cpc_obj)
            biblio.technical_data = technical_data
            logging.info("Data scrapping completed successfully")
            return biblio
        except Exception:
            print("Error retrieving data")
            logging.error("Error occured during Data scrapping", exc_info=True)
            driver.quit()
            # Re-raise instead of silently returning None, which the caller
            # would otherwise serialise as the JSON text "null".
            raise

    #Function which parses through the webpage from where the data needs to be extracted
    @staticmethod
    def navigate(driver, app_no=None):
        """Enter an application number in the search form and submit it.

        Args:
            driver: Selenium WebDriver showing the search form.
            app_no: Application number to search for. When omitted, the
                module-level ``app_no`` variable is used, which is how
                callers that pass only ``driver`` supply the number.

        Returns:
            The same *driver*, after the search button has been clicked.

        Raises:
            Exception: any failure is printed, logged with its traceback,
                the driver is quit, and the original exception is re-raised.
        """
        try:
            logging.info("Starting the parsing through the webpage")
            if app_no is None:
                # Backward compatibility with navigate(driver): read the
                # number from the module-level global.
                app_no = globals()["app_no"]
            #Places the cursor the Application Number text field
            search = driver.find_element(By.XPATH, Configuration_Data.application_field)
            #Clearing the text field of the previous Application Number so that the next number can be entered
            search.clear()
            #Enters the application number(s) (from the textfile) in the specified field
            search.send_keys(app_no)
            time.sleep(2)
            #Clicks on the search button at the end of the page
            driver.find_element(By.XPATH, Configuration_Data.search_button).click()
            time.sleep(2)
            logging.info("Parsing done successfully")
            return driver
        except Exception:
            print("Parsing cannot be done")
            logging.error("Error occured during webpage parsing", exc_info=True)
            driver.quit()
            raise

    #Function which converts the extracted python object into a JSON object
    @staticmethod
    def toJSON(reference):
        """Serialise *reference* to an indented JSON string.

        Objects that ``json`` cannot encode natively are encoded through
        their ``__dict__``, so nested data objects become nested JSON objects.

        Args:
            reference: The object (or plain JSON-compatible value) to encode.

        Returns:
            The JSON text, indented by 4 spaces, keys in insertion order.
        """
        logging.info("Starting JSON conversion")
        json_text = json.dumps(reference, default=lambda o: o.__dict__, sort_keys=False, indent=4)
        # Report success only after the conversion has actually succeeded.
        logging.info("JSON conversion successful")
        return json_text
if __name__=='__main__':
    # Script entry point: read application numbers from 'appnos.txt', scrape
    # each one, write '<application number>.json' and insert the same
    # document into MongoDB.
    #
    # The driver is created inside the try block; it starts as None so the
    # cleanup code can tell whether a browser was ever started.
    driver = None
    try:
        #Creating a logger file to keep track of the steps
        logging.basicConfig(filename='EUScrape.log', level=logging.INFO,format='%(asctime)s:%(levelname)s:%(message)s')
        logging.info("Opening and reading the text file")
        #Reading the text file which contains the required application numbers
        os.chdir(r'C:\Users\ojus.g_maxval\OjusG\Europe Scrape')
        #Read the application numbers from the text file 'appnos.txt'
        with open('appnos.txt', 'r') as appnos_file:
            # Strip surrounding whitespace/newlines and drop blank lines, which
            # would otherwise trigger an empty search and a file named '.json'.
            appno = [line.strip() for line in appnos_file if line.strip()]
        logging.info("Automation starts")
        #Setting up the webdriver to read the EU Patent website
        driver = webdriver.Chrome(r"C:\\Users\\ojus.g_maxval\\Downloads\\chromedriver.exe")
        #Establishing connection with MongoDB (once, reused for every application number)
        mongo = MongoDBStore.Mongo(Configuration_Data.connection_string)
        #Getting database and collection name to insert data in desired collection in the database
        col = mongo.getCol('Europe','JSON_Data')
        #To run the process in loop for multiple application numbers
        # The loop variable must stay the module-level name 'app_no':
        # Scrapper.navigate(driver) reads the number from that global.
        for app_no in appno:
            logging.info("Main process starts")
            # Reload the search form for every number; after a search the
            # browser is no longer showing the form.
            driver.get("https://register.epo.org/advancedSearch?lng=en")
            time.sleep(2)
            #Calling the navigate function under the Scrapper class to execute
            Scrapper.navigate(driver)
            #Calling the GetBiblioData function under Scrapper class to execute
            o = Scrapper.GetBiblioData(driver)
            if o is None:
                # A helper that handled its own error returns None; stop here
                # rather than storing a "null" document.
                raise RuntimeError("No data scraped for application number " + app_no)
            #Calling the toJSON function under Scrapper class to execute
            jsonstr = Scrapper.toJSON(o)
            print(jsonstr)
            #Creating a separate JSON file for each application number
            with open(app_no + '.json', 'w') as f:
                logging.info("Creating JSON")
                f.write(jsonstr)
            #To push the JSON data into MongoDB
            logging.info("Pushing JSON data into MongoDB")
            # Same content as the file just written, parsed straight from memory.
            file_data = json.loads(jsonstr)
            #Inserting document into MongoDB
            mongo.insertOne(file_data, col)
            logging.info("Data successfully pushed into MongoDB")
    except Exception:
        print("Error, please check the code.")
        logging.error('Exception error occurred',exc_info=True)
    finally:
        # Quit the browser exactly once, on success and on failure, and only
        # if it was actually started.
        if driver is not None:
            driver.quit()