Skip to content

Commit

Permalink
Merge pull request #111 from CS3219-AY2223S1/ezek/scraper-scripts
Browse files Browse the repository at this point in the history
Add scraper script
  • Loading branch information
glennljw authored Nov 9, 2022
2 parents de62e18 + b5fa619 commit 3c39d4a
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 0 deletions.
3 changes: 3 additions & 0 deletions scripts/scrape-questions/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
env/*
venv/*
*.json
32 changes: 32 additions & 0 deletions scripts/scrape-questions/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import requests
from bs4 import BeautifulSoup as bs
import unicodedata
import json

url = "https://bishalsarang.github.io/Leetcode-Questions/out.html"

# Seconds to wait for the questions page before giving up; without a
# timeout, requests would block indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30


def strip_question_number(raw_title):
    """Return *raw_title* without its leading "<number>. " prefix.

    Only the first ". " is treated as the separator, so a title that
    itself contains ". " is kept whole. A title with no ". " at all is
    returned unchanged instead of raising ``IndexError``.
    """
    _, separator, name = raw_title.partition('. ')
    return name if separator else raw_title


def clean_description(html):
    """Return *html* prepared for storage as a question description.

    Newlines become ``<br/>`` tags, double quotes become single quotes,
    and the result is NFKD-normalized.
    """
    cleaned = html.replace('\n', '<br/>').replace('"', "'")
    return unicodedata.normalize('NFKD', cleaned)


def scrape_questions(html_text):
    """Parse *html_text* and return a list of question dicts.

    Every element with ``id="title"`` yields one dict with the keys
    ``'title'`` and ``'description'``. The description is the markup of
    the first tag nested inside the title's next sibling; when that
    sibling or nested tag is absent the description is an empty string.
    """
    soup = bs(html_text, 'html.parser')
    questions = []

    for title in soup.find_all(id="title"):
        title_string = strip_question_number(title.get_text())

        # Find the content of this question: the first tag nested inside
        # the element that follows the title.
        container = title.find_next_sibling()
        content = container.find() if container is not None else None
        description = '' if content is None else clean_description(str(content))

        questions.append({
            'title': unicodedata.normalize('NFKD', title_string),
            'description': description,
        })

    return questions


def main():
    """Download the questions page and write ``questions.json``."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of scraping an error page.
    response.raise_for_status()

    result = scrape_questions(response.text)

    # Write the questions to a JSON file to dump into mongodb
    with open("questions.json", "w", encoding="utf-8") as outfile:
        json.dump(result, outfile)


if __name__ == "__main__":
    main()
7 changes: 7 additions & 0 deletions scripts/scrape-questions/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
beautifulsoup4==4.8.1
certifi==2022.9.24
charset-normalizer==2.1.1
idna==3.4
requests==2.28.1
soupsieve==2.3.2.post1
urllib3==1.26.12

1 comment on commit 3c39d4a

@vercel
Copy link

@vercel vercel bot commented on 3c39d4a Nov 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.