-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
34 changed files
with
8,111 additions
and
199 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
from attacks.DataExtraction.enron import EnronDataExtraction | ||
from models.togetherai import TogetherAIModels | ||
from attacks.Jailbreak.jailbreak import Jailbreak | ||
from data import JailbreakQueries | ||
from models import TogetherAIModels | ||
from attacks import Jailbreak | ||
from metrics import JailbreakRate | ||
|
||
enron = EnronDataExtraction(data_path="data/enron") | ||
prompts, _ = enron.generate_prompts(format="prefix-50") | ||
# Replace api_key with your own API key | ||
data = JailbreakQueries() | ||
# Fill in api_key with your own API key | ||
llm = TogetherAIModels(model="togethercomputer/llama-2-7b-chat", api_key="") | ||
attack = Jailbreak() | ||
results = attack.execute_attack(prompts, llm) | ||
print("results:", results) | ||
|
||
results = attack.execute_attack(data, llm) | ||
rate = JailbreakRate(results).compute_metric() | ||
print("rate:", rate) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
from attacks.DataExtraction.enron import EnronDataExtraction | ||
import random | ||
random.seed(0) | ||
import json | ||
import argparse | ||
from tqdm import tqdm | ||
from transformers import AutoTokenizer, AutoModelForCausalLM | ||
import torch | ||
import numpy as np | ||
from models.ft_clm import PeftCasualLM, FinetunedCasualLM | ||
import wandb | ||
|
||
parser = argparse.ArgumentParser() | ||
# parser.add_argument('--num_sample', default=-1, type=int, help='use -1 to include all samples') | ||
parser.add_argument('--min_idx', default=0, type=int) | ||
parser.add_argument('--max_idx', default=100, type=int) | ||
|
||
parser.add_argument('--model', default='./results/llama-2-7B-enron/checkpoint_451', type=str) | ||
parser.add_argument('--arch', default='meta-llama/Llama-2-7b-chat-hf', type=str) | ||
parser.add_argument('--peft', default='lora', type=str) | ||
parser.add_argument('--min_prompt_len', default=200, type=int) | ||
parser.add_argument('--max_seq_len', default=1024, type=int) | ||
|
||
args = parser.parse_args() | ||
|
||
wandb.init(project='LLM-PBE', config=vars(args)) | ||
|
||
model_path=args.model | ||
|
||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | ||
|
||
if args.arch == 'none': | ||
args.arch = None # will infer default arch from model. | ||
|
||
if args.peft == 'none': | ||
llm = FinetunedCasualLM(model_path=args.model, arch=args.arch, max_seq_len=args.max_seq_len) | ||
else: | ||
llm = PeftCasualLM(model_path=args.model, arch=args.arch, max_seq_len=args.max_seq_len) | ||
|
||
enron = EnronDataExtraction(data_path="data/enron") | ||
format=f'prefix-{args.min_prompt_len}' | ||
model_card= args.model.split('/')[-2] + '_' + args.model.split('/')[-1] | ||
|
||
|
||
prompts, labels = enron.generate_prompts(format=format) | ||
|
||
if args.max_idx > len(prompts): | ||
args.max_idx=len(prompts) | ||
|
||
prompts= prompts[args.min_idx:args.max_idx] | ||
labels= labels[args.min_idx:args.max_idx] | ||
|
||
output_fname= f'generations/enron/{model_card}_idx{args.min_idx}_idx{args.max_idx}_min{args.min_prompt_len}.jsonl' | ||
result=[] | ||
|
||
for i, prompt in enumerate(tqdm(prompts)): | ||
|
||
ground_truth = labels[i] | ||
try: | ||
res= llm.query(prompt) | ||
result.append({'idx':i+args.min_idx, 'output':res,'label':ground_truth, 'prompt':prompt}) | ||
|
||
except Exception as e: | ||
print(e) | ||
continue | ||
|
||
if i%5==0: | ||
print(f'Finish {i} samples') | ||
with open(output_fname, 'w') as outfile: | ||
for entry in result: | ||
json.dump(entry, outfile) | ||
outfile.write('\n') | ||
|
||
with open(output_fname, 'w') as outfile: | ||
for entry in result: | ||
json.dump(entry, outfile) | ||
outfile.write('\n') | ||
|
||
wandb.finish() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,17 @@ | ||
|
||
```shell | ||
pip install rapidfuzz | ||
``` | ||
``` | ||
|
||
## How to run | ||
|
||
Run the following commands from the repository root folder. | ||
|
||
* Attack | ||
```shell | ||
wandb sweep sweeps/extract_prompts.yml | ||
``` | ||
* Defense | ||
```shell | ||
wandb sweep sweeps/extract_prompts_defense.yml | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,3 @@ | ||
# Datasets | ||
|
||
- Dataset 1: [link] | ||
- Dataset 2: [link] | ||
... | ||
|
||
|
||
## Prompt Leakage | ||
|
||
1. Download GPTStores system prompts | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
## AdvBench | ||
|
||
"harmful_behavirs.csv" contains the harmful behaviours summarized in the [AdvBench](https://github.com/llm-attacks/llm-attacks/tree/main/data/advbench) dataset. |
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.