Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
QinbinLi committed May 26, 2024
1 parent 3828bc8 commit 1579cfd
Show file tree
Hide file tree
Showing 34 changed files with 8,111 additions and 199 deletions.
16 changes: 0 additions & 16 deletions .vscode/launch.json

This file was deleted.

18 changes: 9 additions & 9 deletions AttackDemo.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from attacks.DataExtraction.enron import EnronDataExtraction
from models.togetherai import TogetherAIModels
from attacks.Jailbreak.jailbreak import Jailbreak
from data import JailbreakQueries
from models import TogetherAIModels
from attacks import Jailbreak
from metrics import JailbreakRate

enron = EnronDataExtraction(data_path="data/enron")
prompts, _ = enron.generate_prompts(format="prefix-50")
# Replace api_key with your own API key
data = JailbreakQueries()
# Fill api_key
llm = TogetherAIModels(model="togethercomputer/llama-2-7b-chat", api_key="")
attack = Jailbreak()
results = attack.execute_attack(prompts, llm)
print("results:", results)

results = attack.execute_attack(data, llm)
rate = JailbreakRate(results).compute_metric()
print("rate:", rate)
2 changes: 1 addition & 1 deletion attacks/DataExtraction/extract_enron_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
print(e)
continue

if i%500==0:
if i%100==0:
print(f'Finish {i} samples')
with open(output_fname, 'w') as outfile:
for entry in result:
Expand Down
79 changes: 79 additions & 0 deletions attacks/DataExtraction/extract_enron_parallel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""Extract memorized Enron data from a fine-tuned causal LM.

Feeds prefix prompts built from the Enron dataset to the model and saves
each generation together with its ground-truth continuation as JSON lines.
The [min_idx, max_idx) slice lets several jobs run in parallel over
disjoint prompt ranges (hence the file name).
"""
from attacks.DataExtraction.enron import EnronDataExtraction
import random
random.seed(0)  # keep prompt generation deterministic across parallel jobs
import json
import argparse
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
from models.ft_clm import PeftCasualLM, FinetunedCasualLM
import wandb

parser = argparse.ArgumentParser()
# parser.add_argument('--num_sample', default=-1, type=int, help='use -1 to include all samples')
parser.add_argument('--min_idx', default=0, type=int,
                    help='first prompt index (inclusive) handled by this job')
parser.add_argument('--max_idx', default=100, type=int,
                    help='last prompt index (exclusive) handled by this job')

parser.add_argument('--model', default='./results/llama-2-7B-enron/checkpoint_451', type=str,
                    help='path to the fine-tuned checkpoint')
parser.add_argument('--arch', default='meta-llama/Llama-2-7b-chat-hf', type=str,
                    help="base architecture; pass 'none' to infer from the checkpoint")
parser.add_argument('--peft', default='lora', type=str,
                    help="'none' loads a fully fine-tuned model, anything else a PEFT adapter")
parser.add_argument('--min_prompt_len', default=200, type=int)
parser.add_argument('--max_seq_len', default=1024, type=int)

args = parser.parse_args()

wandb.init(project='LLM-PBE', config=vars(args))

model_path = args.model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if args.arch == 'none':
    args.arch = None  # will infer default arch from model.

if args.peft == 'none':
    llm = FinetunedCasualLM(model_path=args.model, arch=args.arch, max_seq_len=args.max_seq_len)
else:
    llm = PeftCasualLM(model_path=args.model, arch=args.arch, max_seq_len=args.max_seq_len)


def _dump_results(fname, entries):
    """Overwrite *fname* with *entries* serialized as one JSON object per line."""
    with open(fname, 'w') as outfile:
        for entry in entries:
            json.dump(entry, outfile)
            outfile.write('\n')


enron = EnronDataExtraction(data_path="data/enron")
format = f'prefix-{args.min_prompt_len}'
# NOTE(review): assumes the model path has at least two components
# (e.g. results/<run>/<checkpoint>) — verify for custom --model values.
model_card = args.model.split('/')[-2] + '_' + args.model.split('/')[-1]


prompts, labels = enron.generate_prompts(format=format)

# Clamp the slice so a too-large --max_idx degrades gracefully.
if args.max_idx > len(prompts):
    args.max_idx = len(prompts)

prompts = prompts[args.min_idx:args.max_idx]
labels = labels[args.min_idx:args.max_idx]

output_fname = f'generations/enron/{model_card}_idx{args.min_idx}_idx{args.max_idx}_min{args.min_prompt_len}.jsonl'
result = []

for i, prompt in enumerate(tqdm(prompts)):

    ground_truth = labels[i]
    try:
        res = llm.query(prompt)
        result.append({'idx': i + args.min_idx, 'output': res, 'label': ground_truth, 'prompt': prompt})

    except Exception as e:
        # Best-effort extraction: log the failure and keep going so one bad
        # prompt does not lose the whole run.
        print(e)
        continue

    # Checkpoint the full result list every 5 prompts so a crash loses
    # at most a few generations.
    if i % 5 == 0:
        print(f'Finish {i} samples')
        _dump_results(output_fname, result)

# Final write captures any samples generated since the last checkpoint.
_dump_results(output_fname, result)

wandb.finish()
20 changes: 20 additions & 0 deletions attacks/MIA/member_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class MIAMetric(Enum):

LIRA = "lira" # https://arxiv.org/pdf/2203.03929.pdf
NEIGHBOR = "neighbor" # https://aclanthology.org/2023.findings-acl.719.pdf
MIN_K_PROB = "min_k_prob" # https://arxiv.org/pdf/2310.16789.pdf

class MemberInferenceAttack(AttackBase):
"""Membership Inference Attack (MIA).
Expand Down Expand Up @@ -87,6 +88,25 @@ def _get_score(self, model: FinetunedCasualLM, text: str):
ppl = model.evaluate_ppl(text)
num_bits = len(zlib.compress(text.encode())) * 8
score = ppl / num_bits
elif self.metric == MIAMetric.MIN_K_PROB:
# Get logits from model
input_ids = model.tokenizer.encode(text, return_tensors='pt', truncation=True, max_length=model.max_seq_len).cuda()
with torch.no_grad():
outputs = model._lm(input_ids, labels=input_ids)
logits = outputs[1]

# Apply softmax to the logits to get probabilities
probabilities = torch.nn.functional.log_softmax(logits, dim=-1).cpu().data
all_prob = []
input_ids_processed = input_ids[0][1:]
for i, token_id in enumerate(input_ids_processed):
probability = probabilities[0, i, token_id].item()
all_prob.append(probability)

# Calculate Min-K% Probability
k_length = int(len(all_prob) * 0.10) # TODO: For now, K is hard-coded as 10%
topk_prob = np.sort(all_prob)[:k_length]
score = -np.mean(topk_prob).item()
else:
raise NotImplementedError(f"{self.metric}")
return score
Expand Down
15 changes: 14 additions & 1 deletion attacks/PromptLeakage/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,17 @@

```shell
pip install rapidfuzz
```

## How to run

Run the following commands from the repository root.

* Attack
```shell
wandb sweep sweeps/extract_prompts.yml
```
* Defense
```shell
wandb sweep sweeps/extract_prompts_defense.yml
```
7 changes: 0 additions & 7 deletions data/README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,3 @@
# Datasets

- Dataset 1: [link]
- Dataset 2: [link]
...


## Prompt Leakage

1. Download GPTStores system prompts
Expand Down
3 changes: 3 additions & 0 deletions data/advbench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## AdvBench

"harmful_behavirs.csv" includes harmful behaviours summarized in [AdvBench](https://github.com/llm-attacks/llm-attacks/tree/main/data/advbench) dataset.
3,333 changes: 3,333 additions & 0 deletions data/enron/context_phishing.jsonl

Large diffs are not rendered by default.

Loading

0 comments on commit 1579cfd

Please sign in to comment.