-
Notifications
You must be signed in to change notification settings - Fork 1
/
wordlistgpt.py
356 lines (323 loc) · 17.6 KB
/
wordlistgpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
from concurrent.futures import ThreadPoolExecutor
from itertools import product
from time import perf_counter
from functools import reduce
import argparse
import requests
import logging
import random
import json
import re
import os
def parse_arguments(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, so existing callers that
            pass nothing behave as before.

    Returns:
        The parsed ``argparse.Namespace``. ``deterministic_position`` is always
        a list: the ``-dp`` values given by the user, or ``['left', 'right']``
        when the option was not used.
    """
    parser = argparse.ArgumentParser(
        description=(
            'Generate wordlists using a variety of options.\n'
            'Customize the output using arguments such as length, casing, leet speak, and more.\n'
        ),
        epilog=(
            'Examples:\n'
            'python wordlistgpt.py -w "harry potter"\n'
            'python wordlistgpt.py -w cybersecurity -n 50 -min 5 -max 15 -u 2 -l 3 -d 1 -r 1\n'
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    word_options = parser.add_argument_group('Word Options', 'Control the basic word parameters.')
    word_options.add_argument('-w', '--words', nargs='+', help='Words to generate wordlist for.')
    word_options.add_argument('-n', '--number', type=int, default=20, help='Number of words to generate in ChatGPT for each word. (default: %(default)s)')

    size_options = parser.add_argument_group('Size Options', 'Control the size of the words and the wordlist.')
    size_options.add_argument('-min', '--min-size', type=int, default=6, help='Minimum amount of characters for each word. (default: %(default)s)')
    size_options.add_argument('-max', '--max-size', type=int, default=14, help='Maximum amount of characters for each word. (default: %(default)s)')
    size_options.add_argument('-m', '--max-words', type=int, default=10000000, help='Maximum number of words in the wordlist if not batched. (default: %(default)s)')
    size_options.add_argument('-b', '--batch_size', type=int, default=1000000, help='Batch size for wordlist processing. (default: %(default)s)')

    special_options = parser.add_argument_group('Special Options', 'Control the special characters and casing in words.')
    # float('inf') means "no limit"; argparse only applies type=int to values given on the command line.
    special_options.add_argument('-u', '--uppercase', type=int, default=float('inf'), help='Maximum number of characters to convert to uppercase in each word. (default: %(default)s)')
    special_options.add_argument('-l', '--leet', type=int, default=float('inf'), help='Maximum number of leet characters to replace in each word. (default: %(default)s)')
    # The literal used to list 'g' twice ('6' and then '9'). A dict keeps the
    # last value, so the effective mapping was always g -> 9; the dead entry
    # is dropped and the serialized default stays the same.
    special_options.add_argument('-lm', '--leet-mapping', type=str, default=json.dumps({'o': '0', 'i': '1', 'l': '1', 'z': '2', 'e': '3', 'a': '4', 's': '5', 'g': '9', 't': '7', 'b': '8'}),
                                 help='JSON-formatted leet mapping dictionary. (default: %(default)s)')
    special_options.add_argument('-d', '--deterministic-chars', type=int, default=1,
                                 help='Number of deterministic characters to be added. (default: %(default)s)')
    special_options.add_argument('-dc', '--deterministic-charset', type=str, default=r'''0123456789_!@$%#''',
                                 help='Charset of deterministic characters to be added. (default: %(default)s)')
    # With action='append', argparse appends command-line values to a
    # non-empty default list, so "-dp nested" used to yield
    # ['left', 'right', 'nested']. Use None here and fill in the default
    # after parsing so user choices replace it instead.
    special_options.add_argument('-dp', '--deterministic-position', action='append', choices=['left', 'right', 'nested'], default=None,
                                 help='Position for inserting deterministic characters. Can specify multiple options: left, right, nested. E.g., "-dp left -dp right" for both sides.')
    special_options.add_argument('-r', '--random-chars', type=int, default=0, help='Maximum range of random characters to be added. (default: %(default)s)')
    special_options.add_argument('-rc', '--random-charset', type=str, default=r'''0123456789!@$&+_-.?/+;#''', help='Charset of characters to be randomly added. (default: %(default)s)')
    special_options.add_argument('-rl', '--random-level', type=int, default=1, help='Number of iterations of random characters to be added. (default: %(default)s)')
    special_options.add_argument('-rw', '--random-weights', nargs=3, type=float, default=[0.47, 0.47, 0.06],
                                 help=('Weights for determining position of random character insertion.\n'
                                       'First value: Probability for inserting at the beginning.\n'
                                       'Second value: Probability for inserting at the end.\n'
                                       'Third value: Probability for inserting at a random position. (default: %(default)s)'))

    other_options = parser.add_argument_group('Other Options')
    other_options.add_argument('-k', '--key', type=str, help='OpenAI API Key. (default: %(default)s)', default=None)
    other_options.add_argument('-o', '--output', type=str, default='wordlist.txt',help='Output file for the generated wordlist. (default: %(default)s)')
    other_options.add_argument('-v', '--debug', action='store_true', default=False, help='If True, enable debug logging. (default: %(default)s)')
    other_options.add_argument('-s', '--silent', action='store_true', default=False, help='If True, disable logging. (default: %(default)s)')

    args = parser.parse_args(argv)
    if args.deterministic_position is None:
        args.deterministic_position = ['left', 'right']
    return args
def main():
    """Run the command-line tool from start to finish.

    Loads the optional ``.env`` file, parses the arguments, configures
    logging, validates the input and then lets ``WordlistGenerator`` build
    and save the wordlist. Returns early when validation fails.
    """
    load_env()
    args = parse_arguments()
    set_logger(args)

    # The command-line key takes precedence over the environment variable.
    api_key = args.key or os.getenv("API_KEY")
    if not validate_args(args, api_key):
        return

    # Log a copy of the arguments with the key masked so the secret never
    # reaches the debug output.
    loggable_args = dict(vars(args))
    if api_key:
        loggable_args['key'] = f"{api_key[:3]}...{api_key[-4:]}"
    logging.debug(f"Arguments parsed: {loggable_args}")
    logging.info("Starting WordlistGPT...")

    generator = WordlistGenerator(args, api_key)
    generator.orchestrate_threads()
    generator.save_wordlist()
def validate_args(args, openai_key):
    """Check that the parsed arguments are usable.

    A missing API key only produces a warning; missing seed words are an
    error.

    Args:
        args: Parsed arguments; only ``args.words`` is read.
        openai_key: The OpenAI key, or a falsy value when none is available.

    Returns:
        ``True`` when at least one word was supplied, ``False`` otherwise.
    """
    if not openai_key:
        logging.warning("API_KEY is not set in the environment variables. To generate more related words with GPT, set it in the .env file with API_KEY=YOUR API KEY or enter as an argument --key.")

    has_words = bool(args.words)
    if not has_words:
        logging.error(
            "No words provided. Use -w or --words argument followed by one or more words.")
    return has_words
def load_env(env_file=".env"):
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without an ``=`` are
    skipped instead of raising ``ValueError``. Whitespace around the key and
    the value is stripped. Only the first ``=`` separates key from value, so
    values may contain ``=``. Existing environment variables with the same
    name are overwritten. A missing file is silently ignored.

    Args:
        env_file: Path of the file to read.
    """
    if not os.path.exists(env_file):
        return

    with open(env_file, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            key, separator, value = line.partition('=')
            key = key.strip()
            if not separator or not key:
                # Not a KEY=VALUE pair; ignore it rather than abort start-up.
                continue
            os.environ[key] = value.strip()
def set_logger(args):
    """Configure the root logger from the command-line flags.

    ``args.silent`` wins over ``args.debug``; with neither flag the level is
    INFO. A stream handler using ``CustomFormatter`` is attached to the root
    logger.

    Args:
        args: Parsed arguments; ``silent`` and ``debug`` are read.
    """
    if args.silent:
        level = logging.CRITICAL
    elif args.debug:
        level = logging.DEBUG
    else:
        level = logging.INFO

    root_logger = logging.getLogger()
    root_logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(CustomFormatter())
    root_logger.addHandler(handler)
class CustomFormatter(logging.Formatter):
    """Log formatter that prefixes every message with a coloured level tag."""

    # ANSI escape sequences used to colour the tags.
    GREEN = '\033[1;32m'
    YELLOW = '\033[1;33m'
    RED = '\033[1;31m'
    WHITE = '\033[1;37m'
    RESET = '\033[0m'

    # Tag shown in front of the message for each standard logging level.
    LEVELS = {
        logging.DEBUG: WHITE + "[~]" + RESET,
        logging.INFO: GREEN + "[+]" + RESET,
        logging.WARNING: YELLOW + "[!]" + RESET,
        logging.ERROR: RED + "[x]" + RESET,
        logging.CRITICAL: RED + "[X]" + RESET,
    }

    def format(self, record):
        """Return ``"<tag> <message>"``, followed by the traceback if any.

        Levels without an entry in ``LEVELS`` get an empty tag.
        """
        tag = self.LEVELS.get(record.levelno, "")
        parts = [f"{tag} {record.getMessage()}"]
        if record.exc_info:
            parts.append(self.formatException(record.exc_info))
        return "\n".join(parts)
class WordlistGenerator:
    """Build a wordlist from seed words, optional GPT suggestions and mutations.

    Words accumulate in an in-memory set. Whenever that set grows beyond
    ``args.batch_size`` it is appended to ``args.output`` and cleared, and
    ``batch_count`` keeps track of how many words were flushed that way.
    """

    # Seconds to wait for the OpenAI endpoint before giving up on a request.
    _REQUEST_TIMEOUT = 60
    # Number of pending words the insert_* methods buffer before handing them
    # to the wordlist setter.
    _FLUSH_THRESHOLD = 10000

    def __init__(self, args, openai_key):
        """Store the configuration and initialise the empty wordlist.

        Args:
            args: Parsed command-line arguments (``argparse.Namespace``).
            openai_key: OpenAI API key, or a falsy value to skip GPT calls.
        """
        self.args = args
        self.openai_key = openai_key
        self.gpt_endpoint = "https://api.openai.com/v1/chat/completions"
        self.leet_mapping = json.loads(args.leet_mapping)
        self._wordlist = set()
        self.batch_count = 0
        # First rough guess (seed word plus requested GPT words); refined
        # later by estimate_words().
        self.estimated_words_number = len(args.words) * (args.number + 1)
        self.estimated_storage_size = self.human_size(
            (self.args.min_size + self.args.max_size) * self.estimated_words_number / 2)
        self.title_bar = "Progress:"

    @property
    def wordlist(self):
        """Return the current in-memory words as a new sorted list."""
        return sorted(self._wordlist)

    @wordlist.setter
    def wordlist(self, words):
        """Add ``words`` to the wordlist (assignment merges, it never replaces).

        A single non-collection value is treated as one word. When the set
        exceeds ``args.batch_size`` it is saved to disk and cleared. Unless
        ``args.silent`` is set, the progress bar is refreshed.
        """
        if not isinstance(words, (set, list, tuple)):
            words = {words}
        self._wordlist.update(words)
        if len(self._wordlist) > self.args.batch_size:
            if not self.args.silent:
                self.print_progress_bar(len(self._wordlist)+self.batch_count, self.estimated_words_number, self.estimated_storage_size, "Saving...")
            self.save_wordlist()
            self.batch_count += len(self._wordlist)
            del self.wordlist
        if not self.args.silent:
            self.print_progress_bar(len(self._wordlist)+self.batch_count, self.estimated_words_number, self.estimated_storage_size)

    @wordlist.deleter
    def wordlist(self):
        """Empty the in-memory wordlist."""
        self._wordlist.clear()

    def force_len(self, min_len, max_len):
        """Drop every word whose length is outside ``[min_len, max_len]``."""
        self._wordlist -= {word for word in self._wordlist if not min_len <= len(word) <= max_len}

    def words_from_string(self, words):
        """Extract a set of lower-cased words/phrases from free-form text.

        The pattern skips leading digits, dashes, line breaks and an optional
        dot (e.g. list numbering) before each captured run of word
        characters, spaces, dashes, dots and quotes.
        """
        return {word.strip().rstrip('.').lower() for word in re.findall(r'''[\d\r\n-]*\.?\s?([\w \-\.'"]+)''', words)}

    def split_subwords(self):
        """Add the pieces of every word, split on non-word characters.

        The original entries stay in the wordlist because assignment merges.
        """
        self.wordlist = {subword for word in self._wordlist for subword in re.split(r'\W+', word)}

    def remove_non_words(self):
        """Replace the wordlist with its words stripped of non-word characters."""
        cleaned_wordlist = {re.sub(r'\W', '', word) for word in self._wordlist}
        del self.wordlist
        self.wordlist = cleaned_wordlist

    def word_over_max_chars(self, word_len):
        """Return True when ``word_len`` exceeds ``args.max_size``."""
        return word_len > self.args.max_size

    def wordlist_over_max_limit(self):
        """Return True (and log a warning) when the in-memory wordlist exceeds ``args.max_words``."""
        if len(self._wordlist) > self.args.max_words:
            logging.warning(
                f"Wordlist has reached the limit of {self.args.max_words} words. Stopping the word insertion.")
            return True
        return False

    def orchestrate_threads(self):
        """Collect the seed/GPT words in worker threads, then run the mutations.

        One task per seed word is submitted to a thread pool. Exceptions
        raised in a worker are logged instead of being silently discarded.
        """
        start = perf_counter()
        logging.info(f"Generating wordlist for {self.args.words}")
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(self.words_from_gpt, word, self.args.number)
                       for word in self.args.words]
        # Leaving the with-block waits for all tasks, so every future is done here.
        for word, future in zip(self.args.words, futures):
            error = future.exception()
            if error is not None:
                logging.error(f"Failed to generate words for {word}: {error}")

        self.generate_wordlist()
        if not self.args.silent:
            self.print_progress_bar(len(self._wordlist), len(self._wordlist), self.estimated_storage_size, "Completed")
            # Wipe the progress bar line before the final log messages.
            print("\r" + " " * 80 + "\r", end='')
        logging.info(f"A total of {len(self._wordlist)+self.batch_count} words have been saved in {self.args.output}")
        logging.info(f"Elapsed time: {round(perf_counter()-start, 2)} seconds.")

    def words_from_gpt(self, word, num_words):
        """Add ``word`` and, if a key is configured, GPT-suggested related words.

        The API is only queried when ``num_words > 1`` and an API key is set.
        Network failures, error statuses and malformed responses are logged
        and leave the wordlist with just the seed word.
        """
        self.wordlist = word
        if not (num_words > 1 and self.openai_key):
            return

        content = f"You are a word generator tool that generates {num_words} words related to the theme {word}. Each word must have a minimum of {self.args.min_size} and a maximum of {self.args.max_size} characters."
        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json"
        }
        data = {
            "model": "gpt-3.5-turbo",
            "messages": [{"role": "system", "content": content}],
            # The character count of the prompt is used as a stand-in for
            # its token count when sizing the reply budget.
            "max_tokens": 4096 - len(content)
        }
        try:
            # Without a timeout a stalled connection would block this worker forever.
            response = requests.post(self.gpt_endpoint, headers=headers, json=data,
                                     timeout=self._REQUEST_TIMEOUT)
        except requests.RequestException as error:
            logging.error(f"API call failed for the word {word}: {error}")
            return

        if not response.ok:
            logging.error(f"API call failed with status code {response.status_code}, Error: {response.text}")
            return

        try:
            reply = response.json()['choices'][0]['message']['content']
        except (ValueError, KeyError, IndexError, TypeError) as error:
            logging.error(f"Unexpected API response for the word {word}: {error}")
            return

        generated_words_from_gpt = self.words_from_string(reply)
        self.wordlist = generated_words_from_gpt
        logging.info(f"Words generated from GPT based on the word {word}: {generated_words_from_gpt}")

    def generate_wordlist(self):
        """Run the mutation pipeline over the collected words.

        Order: split into sub-words, strip non-word characters, estimate the
        final size, then uppercase, leet, deterministic and random mutations
        (each skipped when disabled or when the max-words limit is hit), and
        finally enforce the length bounds. Any exception is logged with its
        traceback rather than propagated.
        """
        try:
            self.split_subwords()
            self.remove_non_words()
            self.estimate_words()
            if not self.wordlist_over_max_limit() and self.args.uppercase > 0:
                self.add_uppercase_variations()
            if not self.wordlist_over_max_limit() and self.args.leet > 0:
                self.add_leet_variations()
            if not self.wordlist_over_max_limit() and self.args.deterministic_chars > 0:
                self.insert_deterministic_chars()
            if not self.wordlist_over_max_limit() and self.args.random_chars > 0:
                self.insert_random_chars()
            self.force_len(self.args.min_size, self.args.max_size)
        except Exception:
            logging.error("An error occurred during wordlist generation", exc_info=True)

    def add_uppercase_variations(self):
        """Add every casing of each word with at most ``args.uppercase`` capitals."""
        for word in self.wordlist:
            if self.wordlist_over_max_limit():
                return
            self.wordlist = {
                ''.join(combination)
                for combination in product(*[(ch.lower(), ch.upper()) for ch in word])
                if sum(1 for c in combination if c.isupper()) <= self.args.uppercase
            }

    def add_leet_variations(self):
        """Add every leet variant of each word with at most ``args.leet`` replacements.

        The limit counts replacements anywhere in the word, mirroring how
        ``add_uppercase_variations`` counts capitals. Previously only the
        first ``args.leet`` replaceable positions were ever considered, so
        e.g. "ab" with a limit of 1 produced "4b" but never "a8".
        """
        for word in self.wordlist:
            if self.wordlist_over_max_limit():
                return
            char_options_list = []
            for ch in word:
                leet_equiv = self.leet_mapping.get(ch)
                char_options_list.append((ch, leet_equiv) if leet_equiv else (ch,))
            variations = set()
            for combination in product(*char_options_list):
                replaced = sum(1 for original, chosen in zip(word, combination) if original != chosen)
                if replaced <= self.args.leet:
                    variations.add(''.join(combination))
            self.wordlist = variations

    def insert_random_chars(self):
        """Add variants of each word with random characters inserted.

        For every word, ``args.random_level`` variants are built, each with
        between 0 and ``args.random_chars`` characters from
        ``args.random_charset``. Each character goes to the start, the end or
        an inner position, chosen with ``args.random_weights``. Variants that
        would exceed ``args.max_size`` are skipped.
        """
        new_words = set()
        for word in self.wordlist:
            if self.wordlist_over_max_limit():
                return
            for _ in range(self.args.random_level):
                new_word = word
                num_chars = random.randint(0, self.args.random_chars)
                if self.word_over_max_chars(len(new_word) + num_chars):
                    continue
                for _ in range(num_chars):
                    char = random.choice(self.args.random_charset)
                    # Words shorter than two characters have no inner
                    # position; clamp the upper bound so randint() does not
                    # raise on an empty range.
                    inner = random.randint(1, max(1, len(new_word) - 1))
                    position = random.choices(
                        [0, len(new_word), inner],
                        weights=self.args.random_weights
                    )[0]
                    new_word = new_word[:position] + char + new_word[position:]
                new_words.add(new_word)
                if len(new_words) > self._FLUSH_THRESHOLD:
                    self.wordlist = new_words
                    new_words.clear()
        self.wordlist = new_words

    def insert_deterministic_chars(self):
        """Add variants of each word with every charset combination attached.

        Combinations of 1..``args.deterministic_chars`` characters from
        ``args.deterministic_charset`` are appended ('right'), prepended
        ('left') and/or placed on both sides ('nested'), depending on
        ``args.deterministic_position``. Stops early, keeping what was built
        so far, once the max-words limit is exceeded.
        """
        new_words = set()
        combinations = []
        for num_chars in range(1, max(1, self.args.deterministic_chars) + 1):
            combinations.extend(''.join(x) for x in product(self.args.deterministic_charset, repeat=num_chars))

        positions = self.args.deterministic_position
        for word in self.wordlist:
            if self.wordlist_over_max_limit():
                break
            for combination in combinations:
                if 'right' in positions:
                    new_words.add(word + combination)
                if 'left' in positions:
                    new_words.add(combination + word)
                if 'nested' in positions:
                    # Both loops walk the same list, so every ordered
                    # (prefix, suffix) pair is produced by this single add.
                    for nested_comb in combinations:
                        new_words.add(nested_comb + word + combination)
                # Flush regularly to bound the size of the pending buffer.
                if len(new_words) > self._FLUSH_THRESHOLD:
                    self.wordlist = new_words
                    new_words.clear()
        self.wordlist = new_words

    def estimate_words(self):
        """Estimate the final word count and storage size for the progress bar.

        Per word, the casing/leet variants are estimated as the product of
        the options per character; the total is then scaled by rough factors
        for the deterministic and random insertions. Results are stored in
        ``estimated_words_number`` and ``estimated_storage_size``.
        """
        total = 0
        for word in self._wordlist:
            if not word:
                continue
            variants = 1
            for ch in word:
                possibilities = {ch.lower(), ch.upper()}
                leet_equiv = self.leet_mapping.get(ch.lower())
                if leet_equiv:
                    possibilities.add(leet_equiv)
                variants *= len(possibilities)
            total += variants
        if self.args.deterministic_chars:
            deterministic_combinations = len(self.args.deterministic_charset) ** self.args.deterministic_chars
            if 'nested' in self.args.deterministic_position:
                total *= 2 + deterministic_combinations ** 2
            else:
                total *= 2 * deterministic_combinations
        if self.args.random_chars:
            total *= 1 + self.args.random_level*0.9
        self.estimated_words_number = int(total)
        self.estimated_storage_size = self.human_size((self.args.min_size + self.args.max_size) * self.estimated_words_number / 2)

    def human_size(self, size, units=(' B', ' KB', ' MB', ' GB', ' TB', ' PB', ' EB')):
        """Format a byte count with two decimals and a binary unit suffix.

        Args:
            size: Number of bytes.
            units: Unit suffixes from smallest to largest, each 1024 times
                the previous one.

        Returns:
            E.g. ``"2.00 KB"``. Sizes beyond the largest unit are expressed
            in that unit instead of raising ``IndexError``.
        """
        index = 0
        while size >= 1024 and index < len(units) - 1:
            size /= 1024
            index += 1
        unit = units[index] if units else ''
        return f"{size:.2f}{unit}"

    @staticmethod
    def print_progress_bar(iteration, total, bytes_size, title="Progress:", bar_length=20):
        """Draw a single-line progress bar on stdout, overwriting the current line.

        The denominator is never below ``iteration`` or 1, so the percentage
        stays within 0-100 even when the estimate was too low.
        """
        max_total = max(1, iteration, total)
        percentage = (iteration / max_total) * 100
        block = int(round(bar_length * iteration / max_total))
        text = f"\033[1;32m[+]\033[0m {title} [{'#' * block}{'-' * (bar_length - block)}] {round(percentage, 2)}% ({iteration}/{max_total}) ~ {bytes_size}\033[K"
        print(text, end='\r' , flush=True)

    def save_wordlist(self):
        """Append the length-filtered, sorted in-memory words to ``args.output``.

        The file is opened in append mode because batches are written
        incrementally. UTF-8 is used explicitly so non-ASCII words do not
        depend on the platform's default encoding.
        """
        self.force_len(self.args.min_size, self.args.max_size)
        with open(self.args.output, 'a', encoding='utf-8') as file:
            file.writelines(f"{word}\n" for word in self.wordlist)
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()