-
Notifications
You must be signed in to change notification settings - Fork 1
/
patent_data_scraper.py
99 lines (84 loc) · 3.76 KB
/
patent_data_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import pandas as pd
from requests_html import HTMLSession
import re
from tqdm import trange
class Scraper:
    """Scrape CPC codes, a timeline date, inventors and the title of patents.

    Pages are fetched from ``https://patents.google.com/patent/<id>`` and
    rendered with ``requests_html`` before the individual ``get_*`` helpers
    pull values out of the rendered HTML.
    """

    # Columns added to the input table by ``scrape_all``.
    _RESULT_COLUMNS = ('cpc', 'date', 'inventor', 'title')

    def __init__(self):
        # HTMLSession must be instantiated: calling ``.get`` on the class
        # itself would bind the URL to ``self`` and raise TypeError.
        self.session = HTMLSession()
        # Table of patents being processed; populated by ``scrape_all``.
        self.df_todo = None
        self.url_0 = "https://patents.google.com/patent/"
        # Regex that a classification entry's text must match to be kept
        # as a CPC code (e.g. "G06F17/30").
        self.pattern = r"^[A-Za-z]\d{2,}[A-Za-z]\d+\/\d{2,}$"

    @staticmethod
    def _resolve_end(end_idx, total):
        """Return the exclusive stop row: ``total`` when ``end_idx`` is 0, else ``end_idx`` capped at ``total``."""
        if end_idx == 0:
            return total
        return min(end_idx, total)

    def scrape_all(self, file_path, export_path, id_column, start_idx=0, end_idx=0):
        """Scrape cpc, date, inventor and title for the patents listed in a CSV.

        The CSV is loaded into ``self.df_todo``, the four result columns are
        added, rows ``start_idx`` .. ``end_idx - 1`` are filled in, and the
        whole table is written to ``export_path``.

        Consecutive rows holding the same patent id reuse the values scraped
        for the previous row instead of fetching the page again.

        Args:
            file_path (str): path of the CSV file holding the patent ids.
            export_path (str): path the resulting CSV is written to.
            id_column (str): name of the column holding the patent ids.
            start_idx (int, optional): first row to process. Defaults to 0.
            end_idx (int, optional): row to stop before; 0 means "up to the
                last row". Defaults to 0.
        """
        self.df_todo = pd.read_csv(file_path, low_memory=False)
        for column in self._RESULT_COLUMNS:
            self.df_todo[column] = ''

        stop = self._resolve_end(end_idx, len(self.df_todo))

        # ``None`` can never equal a cleaned id, so the first processed row
        # is always scraped (an empty-string sentinel would match empty ids).
        previous_id = None
        scraped = dict.fromkeys(self._RESULT_COLUMNS, '')

        for i in trange(start_idx, stop):
            patent_id = str(self.df_todo.at[i, id_column]).replace('-', '')
            if patent_id != previous_id:
                previous_id = patent_id
                r = self.session.get(self.url_0 + patent_id)
                r.html.render()
                date_element = self.get_date(r)
                # Store plain strings so every cell serialises cleanly to CSV.
                scraped = {
                    'cpc': '; '.join(sorted(self.get_cpc(r))),
                    'date': date_element.text if date_element is not None else '',
                    'inventor': '; '.join(self.get_inventor(r)),
                    'title': self.get_title(r),
                }
            # ``.at`` writes into the frame itself; chained ``df[col][i] = v``
            # may write to a temporary copy and silently lose the value.
            for column, value in scraped.items():
                self.df_todo.at[i, column] = value

        self.df_todo.to_csv(export_path, index=False)

    def get_cpc(self, r):
        """Return the set of distinct CPC codes found on the rendered page.

        Args:
            r: response whose ``html.find`` yields elements exposing ``text``.

        Returns:
            set[str]: texts of the classification-tree elements that match
            ``self.pattern``.
        """
        unique_elements = set()
        for element in r.html.find('.style-scope.classification-tree'):
            text = element.text
            if re.match(self.pattern, text) and text not in unique_elements:
                unique_elements.add(text)
                print("the cpc is", text)
        return unique_elements

    def get_date(self, r):
        """Return the date element of the last event in the application timeline.

        For each timeline event the first matching element among the known
        date classes is looked up; every event overwrites the result of the
        previous one, so the value returned belongs to the last event.
        NOTE(review): confirm that the last event is the intended one.

        Args:
            r: response whose ``html.find`` yields the timeline events.

        Returns:
            The element found for the last event, or ``None`` when the page
            has no timeline events or the last event has no date element.
        """
        events = r.html.find('.event.layout.horizontal.style-scope.application-timeline')
        date_classes = ('filed', 'priority', 'reassignment', 'granted', 'publication', 'legal-status')
        # Bound before the loop so an empty timeline yields None rather than
        # an UnboundLocalError.
        date_element = None
        for event in events:
            date_element = None
            # Try the possible date classes in order; the first hit wins.
            for date_class in date_classes:
                date_element = event.find(f'.{date_class}.style-scope.application-timeline', first=True)
                if date_element:
                    break
        return date_element

    def get_inventor(self, r):
        """Return the inventor names listed on the rendered page.

        Args:
            r: response whose ``html.find`` yields elements exposing ``attrs``.

        Returns:
            list[str]: ``data-inventor`` attribute values, in page order.
        """
        return [
            element.attrs['data-inventor']
            for element in r.html.find('dd.style-scope.patent-result state-modifier')
            if 'data-inventor' in element.attrs
        ]

    def get_title(self, r):
        """Return the stripped text of the second ``<h1>`` on the page.

        Args:
            r: response whose ``html.find`` yields the ``<h1>`` elements.

        Returns:
            str: the title, or ``'error'`` when fewer than two ``<h1>``
            elements are present.
        """
        title_element = r.html.find('h1')
        try:
            # Index 1: presumably the first <h1> is a page header — verify.
            title = title_element[1].text.strip()
        except (IndexError, AttributeError):
            title = 'error'
            print("title not found")
        return title