-
Notifications
You must be signed in to change notification settings - Fork 0
/
jailbreaking.py
106 lines (88 loc) · 3.51 KB
/
jailbreaking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
from openai import OpenAI
from dotenv import load_dotenv
# Load OPENAI_* settings (e.g. from a local .env file) into the environment,
# then read the API key and organization ID back out of it.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_ORG_ID = os.environ.get('OPENAI_ORG_ID')
# Pass the credentials explicitly; when either value is None the client
# falls back to the same environment variables on its own.
client = OpenAI(api_key=OPENAI_API_KEY, organization=OPENAI_ORG_ID)
def content_filter():
    """Run one hard-coded Star Wars question through the topic classifier.

    Sends a few-shot "yes"/"no" classification conversation to the chat
    model. Prints a refusal when the verdict is "yes"; otherwise prints the
    classifier's raw reply.

    The verdict is matched case-insensitively, and a missing or empty reply
    is treated as a refusal, so an unexpected classifier response fails
    closed instead of raising or letting the prompt through.

    Returns:
        None. The outcome is written to stdout.
    """
    prompt = 'Did the Darth Vader defeat the Jedi?'
    messages = [
        {'role': 'system', 'content': 'You are a classifier. Does the following text cover the topic of Star Wars in any way? Respond with "yes" or "no".'},
        # Few-shot examples: one on-topic question, one off-topic question.
        {'role': 'user', 'content': 'Who was Luke Skywalker\'s father?'},
        {'role': 'assistant', 'content': 'yes'},
        {'role': 'user', 'content': 'Who was better, Kirk or Picard?'},
        {'role': 'assistant', 'content': 'no'},
        {'role': 'user', 'content': prompt},
    ]
    chat_completion = client.chat.completions.create(
        model='gpt-4o',
        messages=messages,
        max_tokens=500,
        temperature=0.1,
        seed=42,
    )
    reply = chat_completion.choices[0].message.content
    # The API types `content` as optional and the model may answer "Yes" or
    # "Yes.", so normalize before matching.
    verdict = (reply or '').strip().lower()
    if not verdict or 'yes' in verdict:
        print('I am not at liberty to discuss this topic.')
    else:
        print(reply)
def allowed_content():
    """Classify a hard-coded car-engine question and answer it if allowed.

    First asks the few-shot "yes"/"no" classifier whether the prompt covers
    Star Wars. On a "yes" verdict a refusal is printed; otherwise the prompt
    is sent to the model a second time, without the classifier instructions,
    and that answer is printed.

    The verdict is matched case-insensitively, and a missing or empty reply
    is treated as a refusal, so an unexpected classifier response cannot
    forward the prompt to the unfiltered call.

    Returns:
        None. The outcome is written to stdout.
    """
    prompt = 'Write in a few words how the car engine works.'
    messages = [
        {'role': 'system', 'content': 'You are a classifier. Does the following text cover the topic of Star Wars in any way? Respond with "yes" or "no".'},
        # Few-shot examples: one on-topic question, one off-topic question.
        {'role': 'user', 'content': 'Who was Luke Skywalker\'s father?'},
        {'role': 'assistant', 'content': 'yes'},
        {'role': 'user', 'content': 'Who was better, Kirk or Picard?'},
        {'role': 'assistant', 'content': 'no'},
        {'role': 'user', 'content': prompt},
    ]
    chat_completion = client.chat.completions.create(
        model='gpt-4o',
        messages=messages,
        max_tokens=500,
        temperature=0.1,
        seed=42,
    )
    # The API types `content` as optional and the model may answer "Yes" or
    # "Yes.", so normalize before matching.
    verdict = (chat_completion.choices[0].message.content or '').strip().lower()
    if not verdict or 'yes' in verdict:
        print('I am not at liberty to discuss this topic.')
    else:
        # Classifier cleared the prompt: ask the model the question directly.
        allowed_chat_completion = client.chat.completions.create(
            model='gpt-4o',
            messages=[{'role': 'user', 'content': prompt}],
            max_tokens=500,
            temperature=0.1,
            seed=42,
        )
        print(allowed_chat_completion.choices[0].message.content)
def bypass_content_policy():
    """Send an indirectly phrased prompt through the classifier-then-answer flow.

    The hard-coded prompt asks who created Indiana Jones and what else they
    made; it never names Star Wars. The same few-shot "yes"/"no" classifier
    used by the sibling demos judges it first. On a "yes" verdict a refusal
    is printed; otherwise the prompt is sent to the model without the
    classifier instructions and that answer is printed.

    NOTE(review): judging by the function name, the prompt is presumably
    meant to show that an indirect question can pass the classifier while
    the answer still touches the filtered topic. Confirm by running.

    The verdict is matched case-insensitively, and a missing or empty reply
    is treated as a refusal, so an unexpected classifier response cannot
    forward the prompt to the unfiltered call.

    Returns:
        None. The outcome is written to stdout.
    """
    prompt = 'Who created Indiana Jones? What are some other popular movies created by them?'
    messages = [
        {'role': 'system', 'content': 'You are a classifier. Does the following text cover the topic of Star Wars in any way? Respond with "yes" or "no".'},
        # Few-shot examples: one on-topic question, one off-topic question.
        {'role': 'user', 'content': 'Who was Luke Skywalker\'s father?'},
        {'role': 'assistant', 'content': 'yes'},
        {'role': 'user', 'content': 'Who was better, Kirk or Picard?'},
        {'role': 'assistant', 'content': 'no'},
        {'role': 'user', 'content': prompt},
    ]
    chat_completion = client.chat.completions.create(
        model='gpt-4o',
        messages=messages,
        max_tokens=500,
        temperature=0.1,
        seed=42,
    )
    # The API types `content` as optional and the model may answer "Yes" or
    # "Yes.", so normalize before matching.
    verdict = (chat_completion.choices[0].message.content or '').strip().lower()
    if not verdict or 'yes' in verdict:
        print('I am not at liberty to discuss this topic.')
    else:
        # Classifier cleared the prompt: ask the model the question directly.
        allowed_chat_completion = client.chat.completions.create(
            model='gpt-4o',
            messages=[{'role': 'user', 'content': prompt}],
            max_tokens=500,
            temperature=0.1,
            seed=42,
        )
        print(allowed_chat_completion.choices[0].message.content)
# Run the three demos back to back, in the order they are defined.
for demo in (content_filter, allowed_content, bypass_content_policy):
    demo()