-
Notifications
You must be signed in to change notification settings - Fork 1
/
webscrape.py
90 lines (66 loc) · 2.39 KB
/
webscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import requests
import urllib.parse
#import pandas as pd
#import numpy as np
from cs50 import SQL
from functools import wraps
from bs4 import BeautifulSoup
# Configure the CS50 library to use the SQLite database.
# This module-level handle is the one used by webscrapedb_update() and
# webscrapedb_delete() below.
# NOTE(review): the URL carries no directory component, so the file is
# presumably resolved relative to the working directory - confirm.
db = SQL("sqlite:///webscrape.db")
""" 1. Web Scraping """
def bswebscrape(html_site, *, timeout=10):
    """Fetch a web page and return the text of every ``<p>`` element.

    Args:
        html_site: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a host
            that never answers.

    Returns:
        A list with one string per ``<p>`` element, in the order
        ``find_all`` yields them, or the integer ``1`` when the server
        answers with an HTTP status of 400 or above.

    Raises:
        requests.exceptions.RequestException: If the request itself cannot
            be completed (connection failure, timeout, ...).
    """
    web_input = requests.get(html_site, timeout=timeout)

    # Client and server errors are reported with the sentinel value 1
    # rather than an exception.
    if web_input.status_code >= 400:
        return 1

    soup = BeautifulSoup(web_input.text, 'lxml')
    return [paragraph.text for paragraph in soup.find_all('p')]
""" 2. Data Cleaning """
def data_clean(scraped_data):
    """Count how often each word occurs in the scraped paragraphs.

    Every paragraph is stripped of a fixed set of punctuation characters
    (``, . ! ? “ ” ( )``) and then split on whitespace. Matching is
    case-sensitive, so ``"The"`` and ``"the"`` are counted separately.

    Args:
        scraped_data: Iterable of paragraph strings.

    Returns:
        A dict with two keys: ``"WordCount"`` holds the total number of
        words seen, and ``"Data"`` holds a dict mapping each distinct word
        to its number of occurrences, in first-seen order.
    """
    # One translation table removes all punctuation in a single pass.
    strip_punctuation = str.maketrans("", "", ",.!?“”()")

    # A dict gives constant-time lookups while tallying.
    unique_wordlist = {}
    word_count = 0

    for paragraph in scraped_data:
        # split() without an argument breaks on any run of whitespace
        # (spaces, tabs, newlines) and never yields empty strings, so
        # repeated spaces or punctuation-only tokens are not counted.
        for word in paragraph.translate(strip_punctuation).split():
            unique_wordlist[word] = unique_wordlist.get(word, 0) + 1
            word_count += 1

    return {"WordCount": word_count, "Data": unique_wordlist}
""" 4. Update SQL Database """
def webscrapedb_update(unique_wordlist):
    """Insert every word and its count into the ``word_list`` table.

    One row is inserted per entry. The first value of each row is a
    running number that starts at 1 and follows the mapping's iteration
    order; the second is the word as a string; the third is its count as
    an integer.

    Args:
        unique_wordlist: Mapping of word to occurrence count.

    Returns:
        Always ``0``.
    """
    # NOTE(review): numbering restarts at 1 on every call; if the first
    # column of word_list must be unique, the table presumably has to be
    # emptied before calling this again - confirm against the schema.
    for row_number, (word, count) in enumerate(unique_wordlist.items(), start=1):
        db.execute(
            "INSERT INTO word_list VALUES (?, ?, ?)",
            row_number,
            str(word),
            int(count),
        )
    return 0
""" 5. Delete All Rows in SQL Database """
def webscrapedb_delete():
    """Delete every row from the ``word_list`` table.

    Returns:
        Always ``0``.
    """
    db.execute("DELETE FROM word_list")
    return 0