lada-kunc-moveo commited on
Commit
723dcac
·
1 Parent(s): 287d3de

feat: Prompt-Guard simple app

Browse files
Files changed (2) hide show
  1. app.py +239 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+
4
+ import torch
5
+ from torch.nn.functional import softmax
6
+
7
+ from transformers import (
8
+ AutoModelForSequenceClassification,
9
+ AutoTokenizer,
10
+ )
11
+
12
+ """
13
+ Utilities for loading the PromptGuard model and evaluating text for jailbreaks and indirect injections.
14
+
15
+ Note that the underlying model has a maximum recommended input size of 512 tokens as a DeBERTa model.
16
+ The final two functions in this file implement efficient parallel batched evaluation of the model on a list
17
+ of input strings of arbirary length, with the final score for each input being the maximum score across all
18
+ chunks of the input string.
19
+ """
20
+
21
+
22
+ def load_model_and_tokenizer(model_name='meta-llama/Prompt-Guard-86M'):
23
+ """
24
+ Load the PromptGuard model from Hugging Face or a local model.
25
+
26
+ Args:
27
+ model_name (str): The name of the model to load. Default is 'meta-llama/Prompt-Guard-86M'.
28
+
29
+ Returns:
30
+ transformers.PreTrainedModel: The loaded model.
31
+ """
32
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
33
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
34
+ return model, tokenizer
35
+
36
+
37
+ def preprocess_text_for_promptguard(text: str, tokenizer) -> str:
38
+ """
39
+ Preprocess the text by removing spaces that break apart larger tokens.
40
+ This hotfixes a workaround to PromptGuard, where spaces can be inserted into a string
41
+ to allow the string to be classified as benign.
42
+
43
+ Args:
44
+ text (str): The input text to preprocess.
45
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
46
+
47
+ Returns:
48
+ str: The preprocessed text.
49
+ """
50
+
51
+ try:
52
+ cleaned_text = ''
53
+ index_map = []
54
+ for i, char in enumerate(text):
55
+ if not char.isspace():
56
+ cleaned_text += char
57
+ index_map.append(i)
58
+ tokens = tokenizer.tokenize(cleaned_text)
59
+ result = []
60
+ last_end = 0
61
+ for token in tokens:
62
+ token_str = tokenizer.convert_tokens_to_string([token])
63
+ start = cleaned_text.index(token_str, last_end)
64
+ end = start + len(token_str)
65
+ original_start = index_map[start]
66
+ if original_start > 0 and text[original_start - 1].isspace():
67
+ result.append(' ')
68
+ result.append(token_str)
69
+ last_end = end
70
+ return ''.join(result)
71
+ except Exception:
72
+ return text
73
+
74
+
75
+ def get_class_probabilities(model, tokenizer, text, temperature=1.0, device='cpu', preprocess=True):
76
+ """
77
+ Evaluate the model on the given text with temperature-adjusted softmax.
78
+ Note, as this is a DeBERTa model, the input text should have a maximum length of 512.
79
+
80
+ Args:
81
+ text (str): The input text to classify.
82
+ temperature (float): The temperature for the softmax function. Default is 1.0.
83
+ device (str): The device to evaluate the model on.
84
+
85
+ Returns:
86
+ torch.Tensor: The probability of each class adjusted by the temperature.
87
+ """
88
+ if preprocess:
89
+ text = preprocess_text_for_promptguard(text, tokenizer)
90
+ # Encode the text
91
+ inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
92
+ inputs = inputs.to(device)
93
+ # Get logits from the model
94
+ with torch.no_grad():
95
+ logits = model(**inputs).logits
96
+ # Apply temperature scaling
97
+ scaled_logits = logits / temperature
98
+ # Apply softmax to get probabilities
99
+ probabilities = softmax(scaled_logits, dim=-1)
100
+ return probabilities
101
+
102
+
103
+ def get_jailbreak_score(model, tokenizer, text, temperature=1.0, device='cpu', preprocess=True):
104
+ """
105
+ Evaluate the probability that a given string contains malicious jailbreak or prompt injection.
106
+ Appropriate for filtering dialogue between a user and an LLM.
107
+
108
+ Args:
109
+ text (str): The input text to evaluate.
110
+ temperature (float): The temperature for the softmax function. Default is 1.0.
111
+ device (str): The device to evaluate the model on.
112
+
113
+ Returns:
114
+ float: The probability of the text containing malicious content.
115
+ """
116
+ probabilities = get_class_probabilities(model, tokenizer, text, temperature, device, preprocess)
117
+ return probabilities[0, 2].item()
118
+
119
+
120
+ def get_indirect_injection_score(model, tokenizer, text, temperature=1.0, device='cpu', preprocess=True):
121
+ """
122
+ Evaluate the probability that a given string contains any embedded instructions (malicious or benign).
123
+ Appropriate for filtering third party inputs (e.g. web searches, tool outputs) into an LLM.
124
+
125
+ Args:
126
+ text (str): The input text to evaluate.
127
+ temperature (float): The temperature for the softmax function. Default is 1.0.
128
+ device (str): The device to evaluate the model on.
129
+
130
+ Returns:
131
+ float: The combined probability of the text containing malicious or embedded instructions.
132
+ """
133
+ probabilities = get_class_probabilities(model, tokenizer, text, temperature, device, preprocess)
134
+ return (probabilities[0, 1] + probabilities[0, 2]).item()
135
+
136
+
137
+ def process_text_batch(model, tokenizer, texts, temperature=1.0, device='cpu', preprocess=True):
138
+ """
139
+ Process a batch of texts and return their class probabilities.
140
+ Args:
141
+ model (transformers.PreTrainedModel): The loaded model.
142
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
143
+ texts (list[str]): A list of texts to process.
144
+ temperature (float): The temperature for the softmax function.
145
+ device (str): The device to evaluate the model on.
146
+
147
+ Returns:
148
+ torch.Tensor: A tensor containing the class probabilities for each text in the batch.
149
+ """
150
+ if preprocess:
151
+ texts = [preprocess_text_for_promptguard(text, tokenizer) for text in texts]
152
+ inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
153
+ inputs = inputs.to(device)
154
+ with torch.no_grad():
155
+ logits = model(**inputs).logits
156
+ scaled_logits = logits / temperature
157
+ probabilities = softmax(scaled_logits, dim=-1)
158
+ return probabilities
159
+
160
+
161
+ def get_scores_for_texts(model, tokenizer, texts, score_indices, temperature=1.0, device='cpu', max_batch_size=16, preprocess=True):
162
+ """
163
+ Compute scores for a list of texts, handling texts of arbitrary length by breaking them into chunks and processing in parallel.
164
+ Args:
165
+ model (transformers.PreTrainedModel): The loaded model.
166
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
167
+ texts (list[str]): A list of texts to evaluate.
168
+ score_indices (list[int]): Indices of scores to sum for final score calculation.
169
+ temperature (float): The temperature for the softmax function.
170
+ device (str): The device to evaluate the model on.
171
+ max_batch_size (int): The maximum number of text chunks to process in a single batch.
172
+
173
+ Returns:
174
+ list[float]: A list of scores for each text.
175
+ """
176
+ all_chunks = []
177
+ text_indices = []
178
+ for index, text in enumerate(texts):
179
+ chunks = [text[i:i+512] for i in range(0, len(text), 512)]
180
+ all_chunks.extend(chunks)
181
+ text_indices.extend([index] * len(chunks))
182
+ all_scores = [0] * len(texts)
183
+ for i in range(0, len(all_chunks), max_batch_size):
184
+ batch_chunks = all_chunks[i:i+max_batch_size]
185
+ batch_indices = text_indices[i:i+max_batch_size]
186
+ probabilities = process_text_batch(model, tokenizer, batch_chunks, temperature, device, preprocess)
187
+ scores = probabilities[:, score_indices].sum(dim=1).tolist()
188
+
189
+ for idx, score in zip(batch_indices, scores):
190
+ all_scores[idx] = max(all_scores[idx], score)
191
+ return all_scores
192
+
193
+
194
+ def get_jailbreak_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16, preprocess=True):
195
+ """
196
+ Compute jailbreak scores for a list of texts.
197
+ Args:
198
+ model (transformers.PreTrainedModel): The loaded model.
199
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
200
+ texts (list[str]): A list of texts to evaluate.
201
+ temperature (float): The temperature for the softmax function.
202
+ device (str): The device to evaluate the model on.
203
+ max_batch_size (int): The maximum number of text chunks to process in a single batch.
204
+
205
+ Returns:
206
+ list[float]: A list of jailbreak scores for each text.
207
+ """
208
+ return get_scores_for_texts(model, tokenizer, texts, [2], temperature, device, max_batch_size, preprocess)
209
+
210
+
211
+ def get_indirect_injection_scores_for_texts(model, tokenizer, texts, temperature=1.0, device='cpu', max_batch_size=16, preprocess=True):
212
+ """
213
+ Compute indirect injection scores for a list of texts.
214
+ Args:
215
+ model (transformers.PreTrainedModel): The loaded model.
216
+ tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the model.
217
+ texts (list[str]): A list of texts to evaluate.
218
+ temperature (float): The temperature for the softmax function.
219
+ device (str): The device to evaluate the model on.
220
+ max_batch_size (int): The maximum number of text chunks to process in a single batch.
221
+
222
+ Returns:
223
+ list[float]: A list of indirect injection scores for each text.
224
+ """
225
+ return get_scores_for_texts(model, tokenizer, texts, [1, 2], temperature, device, max_batch_size, preprocess)
226
+
227
+ model, tokenizer = load_model_and_tokenizer()
228
+
229
+ async def process(text):
230
+ start_time = time.monotonic()
231
+ probabilities = get_class_probabilities(model, tokenizer, text, device='cpu', preprocess=True)
232
+ jailbreak_score = probabilities[0, 2].item()
233
+ injection_score = (probabilities[0, 1] + probabilities[0, 2]).item()
234
+ end_time = time.monotonic()
235
+ return {"jailbreak": jailbreak_score, "injection": injection_score, "duration": f"{end_time-start_time:.3f}"}
236
+
237
+ demo = gr.Interface(fn=process, inputs="text", outputs="json")
238
+
239
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch
2
+ transformers