import promptbench as pb
from promptbench.models import LLMModel
from promptbench.prompt_attack import Attack
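# NOTE: the next line appears to be stray notebook output (the tail of a warning message), not part of this script:
#   from .autonotebook import tqdm as notebook_tqdm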
# Prompt-attack example: build a model, a dataset and a prompt, then run a
# "stresstest" attack against the prompt and print the result.

# create model
model_t5 = LLMModel(model='google/flan-t5-large')

# create dataset
dataset = pb.DatasetLoader.load_dataset("sst2")

# create prompt; "{content}" is a placeholder inside the prompt template
prompt = "As a sentiment classifier, determine whether the following text is 'positive' or 'negative'. Please classify: \nQuestion: {content}\nAnswer:"

# define the input process function, output process function, and evaluation function
input_process_func = pb.InputProcess.basic_format
output_process_func = pb.OutputProcess.cls
eval_func = pb.Eval.compute_cls_accuracy

# define the unmodifiable words in the prompt
# the labels "positive" and "negative" must not be altered by the attack, and
# "content" is listed as unmodifiable too because it is the placeholder name
unmodifiable_words = ["positive", "negative", "content"]

# create attack, specify the model, attack name, dataset, prompt, input process function,
# output process function, evaluation function, and unmodifiable words
attack = Attack(model_t5, "stresstest", dataset, prompt, input_process_func, output_process_func, eval_func, unmodifiable_words)

# run the attack and print its result
print(attack.attack())
# Output recorded from a previous run:
#
# Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
#
# These words (if they appear in the prompt) are not allowed to be attacked:
# ['positive', 'negative', 'content']
#
# (0.0, "As a sentiment classifier, determine whether the following text is 'positive' or 'negative'. Please classify: \nQuestion: {content}\nAnswer  and false is not true :", 0.0, 0.0)