pastebin_scrape_com.py
'''
Pastebin scraping permissions:
- Poll at most once per minute for the 100 most recent pastes.
- Recent pastes are returned as standard JSON from:
  https://scrape.pastebin.com/api_scraping.php?limit=<1-250>
  The default limit is 50; the maximum is 250.
- Store the keys locally, then fetch each paste individually for processing.
- DO NOT EXCEED 100 REQUESTS PER MINUTE OR THE IP WILL BE BLOCKED FOR 24 HOURS.

Logic:
- get_recent_pastes() returns a list of dicts.
- Locally store the keys of the last ~200 pastes to avoid re-processing.
- Fetch each new paste and run regex/YARA matching against its content.
- Ship matching pastes off to Elasticsearch.

TODO:
- Add UI
- Add screenshot functionality for links
- Add bulk delete in UI
- Add user auth
- Convert epoch dates to human-readable form
- Filter out HTML
- Add a size limit when indexing pastes
- Add scraping of other sites
'''
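# The recent-pastes listing is a JSON array of metadata records. A minimal
# sketch of the fields this script relies on (the field names come from the
# accesses further down; the values here are hypothetical):
#
#   {
#       "key":    "aB3dE5fG",    # paste ID, used to fetch the raw content
#       "date":   "1645600000",  # creation time as an epoch string
#       "title":  "Untitled",
#       "user":   "someuser",
#       "syntax": "text",
#       "expire": "0"
#   }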
import re
from datetime import datetime
from time import sleep

import requests
import yara
from elasticsearch import Elasticsearch
###========================YARA========================###
rules = yara.compile(filepath='yara/index.yar')

###========================ELASTIC========================###
es = Elasticsearch("http://104.248.115.167:9200")
# print(es.info().body)
mappings = {
    "properties": {
        "key": {"type": "text", "analyzer": "standard"},
        # date fields take a "format", not an "analyzer"
        "date": {"type": "date", "format": "date_hour_minute_second"},
        "title": {"type": "text", "analyzer": "standard"},
        "user": {"type": "text", "analyzer": "standard"},
        "syntax": {"type": "text", "analyzer": "standard"},
        "content": {"type": "text", "analyzer": "standard"},
        "extracted_urls": {"type": "text", "analyzer": "standard"},
        "extracted_ips": {"type": "text", "analyzer": "standard"},
        "extracted_emails": {"type": "text", "analyzer": "standard"},
        "extracted_common_passwords": {"type": "text", "analyzer": "standard"},
        "extracted_common_ssn": {"type": "text", "analyzer": "standard"},
        "hit_type": {"type": "text", "analyzer": "standard"},
        "keyword_match": {"type": "text", "analyzer": "standard"},
        "expire": {"type": "date", "format": "date_hour_minute_second"},
        "yara_match_rules": {"type": "text", "analyzer": "standard"},
        "source": {"type": "text", "analyzer": "standard"},
    }
}
# Create the Elastic index if it does not already exist
try:
    es.indices.create(index="pastebin_scrape_com", mappings=mappings)
except Exception:
    print("Index already exists")
# Grab keywords from search_terms.txt (one per line) and add them to the keywords list
keywords = []
try:
    with open('search_terms.txt', 'r') as f:
        keywords = f.read().split('\n')
except FileNotFoundError:
    print("No search_terms.txt file found")

# Grab company names from company_names.txt (one per line)
try:
    with open('company_names.txt', 'r') as f:
        keywords += f.read().split('\n')
except FileNotFoundError:
    print("No company_names.txt file found")

# On Linux, '' was being added to the search terms, making the search match
# on everything; strip any empty strings from the list
keywords = [k for k in keywords if k != '']
###========================GLOBALS========================###
# Ring buffer of the last 200 paste keys already searched
searched_keys = []
# Syntax types to ignore for keyword searching
ignore_syntax = ['html', 'css', 'js', 'java', 'swift', 'lua']
# Title substrings to ignore
ignore_titles = ['lab', 'algorithm', 'assignment']
###========================FUNCTIONS========================###
def get_recent_pastes(limit=100):
    # Returns a list of dicts describing the most recent pastes
    url = f'https://scrape.pastebin.com/api_scraping.php?limit={limit}'
    response = requests.get(url)
    return response.json()
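# Quick manual check (a sketch; it assumes your IP has been whitelisted for
# the Pastebin scraping API, otherwise the endpoint returns an error page
# rather than JSON):
#   >>> pastes = get_recent_pastes(limit=5)
#   >>> pastes[0]['key']
#   'aB3dE5fG'   # hypothetical key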
def get_paste_content(key):
    # Sleep briefly between item fetches to avoid rate limiting
    sleep(.25)
    scrape_url = "https://scrape.pastebin.com/api_scrape_item.php?i=" + key
    response = requests.get(scrape_url)
    return [response.text, scrape_url]
def send_to_elastic(key, date, title, user, syntax, content, hit_type, keyword_match, expire, yara_matches):
    doc = {
        "key": key,
        "date": date,
        "title": title,
        "user": user,
        "syntax": syntax,
        "content": content,
        "extracted_urls": re.findall(r"(?P<url>https?://[^\s'\"]+)", content),
        "extracted_ips": re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', content),
        "extracted_emails": re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', content),
        # the ^/$ anchors need re.MULTILINE to match per line of the paste
        "extracted_common_passwords": re.findall(r'^(?=.*?[A-Z])(?=.*?[a-z])(?=.*?[0-9])(?=.*?[#?!@$ %^&*-]).{8,}$', content, re.MULTILINE),
        "extracted_common_ssn": re.findall(r'^(?!0{3})(?!6{3})[0-8]\d{2}-(?!0{2})\d{2}-(?!0{4})\d{4}$', content, re.MULTILINE),
        "hit_type": hit_type,
        "keyword_match": keyword_match,
        "expire": expire,
        "yara_match_rules": yara_matches,
        "source": "Pastebin.com API"
    }
    es.index(index="pastebin_scrape_com", document=doc)
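# Note: calling es.index() without an explicit id lets Elasticsearch
# auto-generate the document _id, so indexing the same paste twice would
# create a duplicate; the searched_keys buffer below is what guards
# against re-processing a paste.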
###========================LUHN=CHECKSUM========================###
def luhn_checksum(card_number):
    # The CC regex below allows spaces/dashes as separators; strip them
    # before converting digits, or int(d) would raise a ValueError
    card_number = re.sub(r'[ \-]', '', str(card_number))

    def digits_of(n):
        return [int(d) for d in str(n)]

    digits = digits_of(card_number)
    odd_digits = digits[-1::-2]
    even_digits = digits[-2::-2]
    checksum = sum(odd_digits)
    for d in even_digits:
        checksum += sum(digits_of(d * 2))
    return checksum % 10
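# Sanity check: 4111111111111111 is the well-known Visa test number and is
# Luhn-valid, so the checksum should come back 0:
#   >>> luhn_checksum('4111111111111111')
#   0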
def search_paste_content(json_actual):
    # Fetches and searches the content of the most recent pastes
    hits = 0
    for paste in json_actual:
        keyword_match = ""
        hit_type = ""
        yara_match_type = ""
        # Cap the searched_keys buffer at the 200 most recent keys
        if len(searched_keys) >= 200:
            searched_keys.pop(0)
        if paste['key'] not in searched_keys:
            searched_keys.append(paste['key'])
            paste_data = get_paste_content(paste['key'])[0]
            # Check for YARA matches on ALL pastes
            yara_matches = rules.match(data=paste_data)
            if yara_matches:
                hit_type += "YARA, "
                yara_match_type = ''.join(str(e) for e in yara_matches)
            urls = re.findall(r'(https?://\S+)', paste_data)
            ips = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', paste_data)
            emails = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', paste_data)
            passwords = re.findall(r'^(?=.*?[A-Z])(?=.*?[a-z])(?=.*?[0-9])(?=.*?[#?!@$ %^&*-]).{8,}$',
                                   paste_data, re.MULTILINE)
            ssn = re.findall(r'^(?!0{3})(?!6{3})[0-8]\d{2}-(?!0{2})\d{2}-(?!0{4})\d{4}$',
                             paste_data, re.MULTILINE)
            # Skip keyword searching for junk syntax types (unless YARA or one
            # of the regex extractors already hit) and for junk titles
            junk_syntax = any(x in paste['syntax'].lower() for x in ignore_syntax)
            junk_title = any(x in paste['title'].lower() for x in ignore_titles)
            regex_hit = bool(yara_matches or urls or ips or emails or passwords or ssn)
            if (junk_syntax and not regex_hit) or junk_title:
                pass
            # Start of keyword searching logic
            else:
                for word in keywords:
                    ###========================KEYWORD=REGEX========================###
                    if word in paste_data.lower():
                        keyword_match += (word + ", ")
                        hits += 1
                        if "KEYWORD" not in hit_type:
                            hit_type += "KEYWORD, "
                ###========================CC=REGEX========================###
                # Pull suspected CC numbers out of the paste
                cc_match = re.findall(r'(?:\d{4}[ \-]?){3}\d{4}', paste_data)
                # Validate candidates with the Luhn algorithm
                for cc in cc_match:
                    if luhn_checksum(cc) == 0 and str(cc)[0] != '0':
                        keyword_match += (str(cc) + ", ")
                        print("cc match: " + str(cc))
                        hits += 1
                        # Only tag the hit type once, however many cards match
                        if "CC" not in hit_type:
                            hit_type += "CC, "
                '''REMOVED DUE TO YARA RULE COVERAGE
                ###========================BTC=REGEX========================###
                btc_address_match = re.findall(r'^[13][a-km-zA-HJ-NP-Z1-9]{25,34}$', paste_data, re.MULTILINE)
                if btc_address_match:
                    keyword_match += (str(btc_address_match) + " ")
                    print("btc match: " + str(btc_address_match))
                    hits += 1
                    if "BITCOIN_ADDRESS" not in hit_type:
                        hit_type += "BITCOIN_ADDRESS, "
                '''
                ###========================BASE64=REGEX========================###
                # A stray '@' in the original pattern prevented it from ever
                # matching; requiring at least 16 characters keeps ordinary
                # short words from being flagged as base64
                base64_matches = re.findall(r'^(?=(?:.{4})*$)[A-Za-z0-9+/]{16,}={0,2}$',
                                            paste_data, re.MULTILINE)
                if base64_matches:
                    keyword_match += (str(base64_matches) + ", ")
                    print("base64 match: " + str(base64_matches))
                    hits += 1
                    if "BASE64" not in hit_type:
                        hit_type += "BASE64, "
            # Index the paste if anything at all matched (YARA hits count even
            # when keyword searching was skipped)
            if hit_type != "":
                send_to_elastic(paste['key'], datetime.fromtimestamp(int(paste['date'])), paste['title'],
                                paste['user'], paste['syntax'], paste_data, hit_type, keyword_match,
                                datetime.fromtimestamp(int(paste['expire'])), yara_match_type)
# Main loop
def scrape():
    print("Starting scrape searching for the following keywords and company names:")
    print(keywords)
    print("\nExcluding the following syntax types from keyword search:")
    print(ignore_syntax)
    while True:
        try:
            search_paste_content(get_recent_pastes(100))
        except Exception as e:
            print("Error: ", e)
        # One poll per minute keeps us under the documented rate limit
        sleep(60)


if __name__ == "__main__":
    scrape()