Kaggle essay scoring system (bert, llama3 approaches)

This is a competition still open on Kaggle. My first thought was that BERT plus a downstream supervised head would be the end of it, but I later saw people handling it with llama3 plus a prompt plus fine-tuning, which I found interesting, so I'm writing it up here. The task and data are roughly as follows.

First, the task type: given a text, we need to predict the corresponding score. Since the score itself is numerical, and the official evaluation is also on the order of square(y_pred - y_true), we can treat this as a text-processing problem whose downstream task is regression.
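For reference: if I recall correctly, the leaderboard metric is quadratic weighted kappa, which matches the squared-error intuition above. A minimal sketch with scikit-learn's cohen_kappa_score (the toy values are illustrative):

from sklearn.metrics import cohen_kappa_score

y_true = [1, 3, 4, 6]
y_pred = [2, 3, 4, 5]  # predictions rounded to integer scores
print(cohen_kappa_score(y_true, y_pred, weights='quadratic'))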

bert

specify paths and view the head

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

train_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
test_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'
submission = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv'

train_data = pd.read_csv(train_path)[['full_text', 'score']]
test_data = pd.read_csv(test_path)[['full_text']]
train_data.head()
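Before modeling, it's also worth confirming the label range (the llama3 prompt later in this post assumes integer scores from 1 to 6). A quick check, assuming the columns selected above:

print(train_data.shape)
print(train_data['score'].value_counts().sort_index())  # expect integer scores 1-6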

login wandb

!pip install transformers datasets evaluate accelerate wandb
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wandb_token = user_secrets.get_secret("wandb_token")
! wandb login $wandb_token
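If you'd rather stay in Python than shell out, wandb also exposes a login function that takes the token directly:

import wandb
wandb.login(key=wandb_token)  # equivalent to `wandb login <token>` in the shell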

preprocessing

from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset

tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')

def pre_processing(example):
    return tokenizer(example['text'], truncation=True, padding="max_length", max_length=512)

train_data = train_data.rename(columns={'full_text':'text'})
train_data = train_data.rename(columns={'score':'label'})
train_data['label'] = train_data['label'].astype(float)
test_data = test_data.rename(columns={'full_text':'text'})

train_df, val_df = train_test_split(train_data, test_size=.2)

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_data)


tokenized_train = train_dataset.map(pre_processing, batched=True)
tokenized_valid = valid_dataset.map(pre_processing, batched=True)
tokenized_test = test_dataset.map(pre_processing, batched=True)
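A quick sanity check on one tokenized example (illustrative only):

sample = tokenized_train[0]
print(len(sample['input_ids']))  # 512, because of padding="max_length"
print(sample['label'])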

specify evaluation

import torch

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1)
    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    single_squared_errors = ((logits - labels).flatten()**2).tolist()

    # Compute accuracy:
    # round(pred) == true score exactly when |pred - label| < 0.5,
    # i.e. when the squared error is below 0.5**2 = 0.25
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

train

Note num_labels=1: the Hugging Face sequence-classification head handles the regression case internally. With a single output and float labels, the stock loss is already MSE, so the custom RegressionTrainer below mainly makes this explicit.
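A quick way to see what num_labels=1 gives you (a sketch, assuming the distilbert checkpoint used above; the classifier attribute name is DistilBERT-specific):

from transformers import AutoModelForSequenceClassification

m = AutoModelForSequenceClassification.from_pretrained(
    'distilbert/distilbert-base-uncased', num_labels=1)
print(m.classifier)  # Linear(in_features=768, out_features=1, bias=True) -> one regression output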

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding

model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', num_labels=1)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='./essay_score',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)


import torch

class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]  # single regression output per example
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss


trainer = RegressionTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

quick test

We can use the validation data.

output = trainer.predict(tokenized_valid)
res = list(zip(output.predictions.reshape(-1).round(), val_df['label']))
count = len(res)
true = 0
for i, j in res:
    if i == j:
        true += 1
print(true / count)  # fraction of rounded predictions matching the true score exactly
res
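The notebook stops at validation accuracy; to actually submit, we still need to predict on the test set and fill in the sample submission. A minimal sketch, assuming sample_submission.csv has essay_id and score columns and that valid scores are the integers 1 to 6:

test_output = trainer.predict(tokenized_test)
sub = pd.read_csv(submission)  # the sample_submission path from the first cell
# Round to the nearest score and clip into the valid 1-6 range
sub['score'] = np.clip(test_output.predictions.reshape(-1).round(), 1, 6).astype(int)
sub.to_csv('submission.csv', index=False)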

generative model fine-tuning strategy

Source: Llama 3 8b Instruct Baseline | Train

Then the submission code:

import gc
import torch
import argparse
import pandas as pd

from tqdm import tqdm
from peft import PeftModel
from types import SimpleNamespace
from transformers import AutoTokenizer, AutoModelForCausalLM

torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

def main(args):
    config = SimpleNamespace(
        data_dir='/kaggle/input/learning-agency-lab-automated-essay-scoring-2',
    )

    model = AutoModelForCausalLM.from_pretrained(
        args.model_pth,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    model = PeftModel.from_pretrained(model, args.lora_pth)
    tokenizer = AutoTokenizer.from_pretrained(args.model_pth, padding_side='right')
    tokenizer.pad_token = tokenizer.eos_token

    def preprocess(sample, text=False, infer_mode=False, max_seq=args.max_length, return_tensors=None):
        sys_prompt = "Please read the following essay and assign a score of 1,2,3,4,5,6 where 6 is the best. Output only a single number with no explanation.\n\n"
        prompt = sample["full_text"]
        if infer_mode:
            answer = ""
        else:
            answer = str(sample["score"])

        messages = [
            {"role": "user", "content": sys_prompt + prompt},
            {"role": "assistant", "content": "\n\nThe score is: " + answer}
        ]
        formatted_sample = tokenizer.apply_chat_template(messages, tokenize=False)
        # At inference time, strip the end-of-turn token so the model keeps
        # generating inside the assistant message
        if infer_mode:
            formatted_sample = formatted_sample.replace("<|eot_id|>", "")

        tokenized_sample = tokenizer(formatted_sample, padding=True, return_tensors=return_tensors,
                                     truncation=True, add_special_tokens=False, max_length=max_seq)

        if return_tensors == "pt":
            tokenized_sample["labels"] = tokenized_sample["input_ids"].clone()
        else:
            tokenized_sample["labels"] = tokenized_sample["input_ids"].copy()

        if text:
            return formatted_sample
        return tokenized_sample

    df_test = pd.read_csv(f'{config.data_dir}/test.csv')
    sub = pd.read_csv(f'{config.data_dir}/sample_submission.csv')

    test_preds = []

    for i, row in tqdm(df_test.iterrows(), total=len(df_test)):
        tokenized_sample = preprocess(row, infer_mode=True, max_seq=args.max_length, return_tensors="pt")
        generated_ids = model.generate(**tokenized_sample.to('cuda'), max_new_tokens=2,
                                       pad_token_id=tokenizer.eos_token_id, do_sample=False)
        decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        try:
            answer = decoded[0].rsplit("The score is: ", 1)[1]
            test_preds.append(int(answer))
        except Exception:
            # Fall back to the middle score if the model's output doesn't parse
            test_preds.append(3)

    sub.score = test_preds
    sub.score = sub.score.astype('int')
    sub.to_csv(args.sub_pth, index=False)

    del model, tokenizer
    torch.cuda.empty_cache(); gc.collect()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_pth", type=str, required=True, help="Path to the pretrained model")
    parser.add_argument("--lora_pth", type=str, required=True, help="Path to the PEFT LoRA adapter")
    parser.add_argument("--sub_pth", type=str, required=True, help="Path to save submission file")
    parser.add_argument("--max_length", type=int, required=True, help="Max length of input sequence")
    args = parser.parse_args()

    main(args)

The training part is mostly a matter of setting fine-tuning hyperparameters; from the submission code you can see the main idea is simply to have the model answer a question:

Please read the following essay and assign a score of 1,2,3,4,5,6 where 6 is the best. Output only a single number with no explanation.

Similarly, we should be able to handle this with the unsloth fine-tuning strategy from earlier posts (see the sketch below). This llama3 fine-tune scored 0.773 on the final test.
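As a pointer, the unsloth route would start roughly like this (a sketch based on unsloth's public API; the checkpoint name and LoRA hyperparameters are illustrative, not what the baseline used):

from unsloth import FastLanguageModel

# Load a 4-bit llama3 instruct checkpoint (illustrative model name)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=1024,
    load_in_4bit=True,
)
# Attach LoRA adapters; rank/alpha/targets are typical defaults, not tuned values
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)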