import gc
import torch
import argparse
import pandas as pd

from tqdm import tqdm
from peft import PeftModel
from types import SimpleNamespace
from transformers import AutoTokenizer, AutoModelForCausalLM

torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
def main(args):
    config = SimpleNamespace(
        data_dir='/kaggle/input/learning-agency-lab-automated-essay-scoring-2',
    )

    # Load the base model in bfloat16 and attach the trained LoRA adapter.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_pth,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    model = PeftModel.from_pretrained(model, args.lora_pth)

    tokenizer = AutoTokenizer.from_pretrained(args.model_pth, padding_side='right')
    tokenizer.pad_token = tokenizer.eos_token

    def preprocess(sample, text=False, infer_mode=False, max_seq=args.max_length, return_tensors=None):
        # Build the chat-formatted prompt; in inference mode the assistant turn is
        # left open ("The score is: ") so the model completes it with a score.
        sys_prompt = (
            "Please read the following essay and assign a score of 1,2,3,4,5,6 "
            "where 6 is the best. Output only a single number with no explanation.\n\n"
        )
        prompt = sample["full_text"]
        answer = "" if infer_mode else str(sample["score"])

        messages = [
            {"role": "user", "content": sys_prompt + prompt},
            {"role": "assistant", "content": "\n\nThe score is: " + answer},
        ]
        formatted_sample = tokenizer.apply_chat_template(messages, tokenize=False)
        if infer_mode:
            # Drop the end-of-turn token so generation continues the assistant turn.
            formatted_sample = formatted_sample.replace("<|eot_id|>", "")

        tokenized_sample = tokenizer(
            formatted_sample,
            padding=True,
            return_tensors=return_tensors,
            truncation=True,
            add_special_tokens=False,
            max_length=max_seq,
        )

        if return_tensors == "pt":
            tokenized_sample["labels"] = tokenized_sample["input_ids"].clone()
        else:
            tokenized_sample["labels"] = tokenized_sample["input_ids"].copy()

        if text:
            return formatted_sample
        return tokenized_sample

    df_test = pd.read_csv(f'{config.data_dir}/test.csv')
    sub = pd.read_csv(f'{config.data_dir}/sample_submission.csv')
    test_preds = []
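    # For a quick sanity check of the prompt format, one could print the first
    # formatted sample (an illustrative debugging line, not part of the original script):
    # print(preprocess(df_test.iloc[0], text=True, infer_mode=True))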
    # Score each test essay with greedy decoding and parse the predicted digit.
    for i, row in tqdm(df_test.iterrows(), total=len(df_test)):
        tokenized_sample = preprocess(row, infer_mode=True, max_seq=args.max_length, return_tensors="pt")
        generated_ids = model.generate(
            **tokenized_sample.to('cuda'),
            max_new_tokens=2,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,
        )
        decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        try:
            # The decoded text ends with "The score is: <digit>"; keep the digit.
            answer = decoded[0].rsplit("The score is: ", 1)[1]
            test_preds.append(int(answer))
        except (IndexError, ValueError):
            # Fall back to the middle score when the output cannot be parsed.
            test_preds.append(3)

    sub.score = test_preds
    sub.score = sub.score.astype('int')
    sub.to_csv(args.sub_pth, index=False)

    # Release GPU memory before exiting.
    del model, tokenizer
    torch.cuda.empty_cache()
    gc.collect()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_pth", type=str, required=True, help="Path to the pretrained model")
    parser.add_argument("--lora_pth", type=str, required=True, help="Path to the PEFT LoRA adapter")
    parser.add_argument("--sub_pth", type=str, required=True, help="Path to save submission file")
    parser.add_argument("--max_length", type=int, required=True, help="Max length of input sequence")
    args = parser.parse_args()
    main(args)
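# A minimal example invocation; the script name, model path, adapter path, and
# output path below are hypothetical placeholders, not taken from the original:
#
#   python infer.py \
#       --model_pth /kaggle/input/base-model \
#       --lora_pth /kaggle/input/aes2-lora-adapter \
#       --sub_pth submission.csv \
#       --max_length 2048
#
# The test data location is hard-coded to the Kaggle competition input directory
# via config.data_dir above.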