-
Notifications
You must be signed in to change notification settings - Fork 0
/
ragas_eval.py
121 lines (99 loc) · 4.33 KB
/
ragas_eval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import requests
import json
import os
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas.metrics.critique import harmfulness
from ragas import evaluate
from ragas import adapt
from ragas.metrics import (
answer_relevancy,
faithfulness,
context_recall,
context_precision,
context_relevancy,
answer_correctness,
answer_similarity
)
class RAGAs_Eval():
    """Score RAG outputs with RAGAs, one question at a time, and save the scores.

    Each question is evaluated on its own single-row dataset using the
    metrics ``answer_relevancy``, ``faithfulness``, ``context_recall`` and
    ``context_precision``.
    """

    def __init__(self):
        """Create the GPT-4 chat model and run ``adapt`` on the metrics for Chinese."""
        openai_model = ChatOpenAI(model_name="gpt-4")
        print("正在初始化RAGAs评分系统, 首次运行等待时间较长.....")
        adapt(
            metrics=[
                answer_correctness,
                faithfulness,
                answer_relevancy,
                context_recall,
                context_precision,
            ],
            language="chinese",
            llm=openai_model,
        )

    def max_k(self, data, k, max_tokens=13000):
        """Keep the largest top-k prefix of ``data['contexts']`` that fits the budget.

        RAGAs scores with a GPT model that has a maximum input size, so the
        contexts passed along must stay below a limit.

        NOTE: despite the parameter name, the size is measured in characters
        (summed string lengths), not in model tokens.

        Args:
            data: Sample dict with a ``'contexts'`` sequence. Each context is
                either a string or a sequence of strings. The dict is updated
                in place.
            k: Requested number of leading contexts. Values above the number
                of available contexts are clamped down to it; negative values
                are treated as 0.
            max_tokens: Exclusive upper bound on the total character count of
                the kept contexts.

        Returns:
            A ``(data, k)`` tuple: the same dict with ``'contexts'`` truncated,
            and the number of contexts actually kept.
        """
        contexts = data['contexts']
        k = max(0, min(k, len(contexts)))
        # Compute each context's size once; the inner sum handles both a plain
        # string (its length) and a sequence of strings (their total length).
        sizes = [sum(len(piece) for piece in context) for context in contexts[:k]]
        total = sum(sizes)
        # Drop trailing contexts until the total is strictly below the budget.
        # Stopping at k == 0 guarantees termination even when max_tokens <= 0.
        while k > 0 and total >= max_tokens:
            k -= 1
            total -= sizes[k]
        data['contexts'] = contexts[:k]
        return data, k

    def top_k_ragas_eval(self, question_list, contexts_list, answer_list, ground_truth_list, k=10):
        """Evaluate every question with RAGAs using at most ``k`` contexts each.

        Args:
            question_list: Questions to score.
            contexts_list: For each question, its retrieved contexts.
            answer_list: For each question, the generated answer.
            ground_truth_list: For each question, the reference answer.
            k: Maximum number of leading contexts to use per question.

        Returns:
            A list with one ``evaluate`` result per question, in input order.
        """
        scores = []
        total = len(question_list)
        for index, question in enumerate(question_list):
            print(f'Question {index+1}/{total}')
            data = {
                "question": question,
                "contexts": contexts_list[index],
                "ground_truth": ground_truth_list[index],
                'answer': answer_list[index],
            }
            # Number of contexts that would be used if size were no concern.
            available = max(0, min(k, len(data["contexts"])))
            # Trim the contexts so they stay below the size budget.
            data, new_k = self.max_k(data, k)
            if new_k < available:
                # Only report the size limit when it actually forced the cut.
                print(f"Top-{k} contexts 超过最大字符限制, 本次结果 '{question}' 自动更改为Top-{new_k}")
            elif new_k != k:
                # Fewer contexts were supplied than requested; nothing was cut.
                print(f"'{question}' 可用contexts不足Top-{k}, 本次使用Top-{new_k}")
            eval_dataset = Dataset.from_pandas(pd.DataFrame([data]))
            # Score the single-row dataset with RAGAs; four metrics are requested.
            result = evaluate(
                eval_dataset,
                metrics=[
                    answer_relevancy,
                    faithfulness,
                    context_recall,
                    context_precision,
                ],
            )
            scores.append(result)
        return scores

    def save(self, scores):
        """Write the per-question scores to ``./result/result.xlsx``.

        Args:
            scores: Evaluation results, each exposing ``to_pandas()``. An empty
                list produces an empty spreadsheet instead of raising.
        """
        frames = [s.to_pandas() for s in scores]
        # pd.concat raises on an empty list, so fall back to an empty frame.
        if frames:
            result = pd.concat(frames).reset_index(drop=True)
        else:
            result = pd.DataFrame()
        save_path = './result'
        file_name = 'result.xlsx'
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(save_path, exist_ok=True)
        result.to_excel(os.path.join(save_path, file_name))

    def run(self, question_list, contexts_list, answer_list, ground_truth_list, k=10):
        """Evaluate all questions, save the scores to disk, and return them."""
        scores = self.top_k_ragas_eval(question_list, contexts_list, answer_list, ground_truth_list, k)
        self.save(scores)
        return scores
if __name__ == '__main__':
    # Script entry point: load every JSON sample from the example directory,
    # score all of them with RAGAs, and save the results.
    data_directory = './example_full'
    question_list = []
    contexts_list = []
    answer_list = []
    ground_truth_list = []
    # Load the data. The listing is sorted so the row order of the saved
    # results is the same on every run (os.listdir order is arbitrary).
    for file_name in sorted(os.listdir(data_directory)):
        file_path = os.path.join(data_directory, file_name)
        if not (os.path.isfile(file_path) and file_name.endswith('.json')):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        question_list.append(data["question"])
        contexts_list.append(data["contexts"])
        answer_list.append(data["answer"])
        ground_truth_list.append(data["ground_truth"])
    # Named `evaluator` rather than `eval` to avoid shadowing the builtin.
    evaluator = RAGAs_Eval()
    score = evaluator.run(question_list, contexts_list, answer_list, ground_truth_list, k=10)