!pip intall numpy pandas FlagEmbedding scikit-learn

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from FlagEmbedding import FlagReranker
import json

In [None]:
# Reranker checkpoint to evaluate (placeholder value; set it before running).
model_path = '...'

# Query table; the code below reads its `retrieval_gt`, `qid` and `query` columns.
qd_df = pd.read_parquet('AutoRAG-example-korean-embedding-benchmark/data/qa_v4.parquet')
# Collapse each nested ground-truth entry to its first id (x[0][0]);
# any additional ids stored for a query are dropped here.
qd_df['retrieval_gt'] = qd_df['retrieval_gt'].apply(lambda x : x[0][0])

corpus_df = pd.read_parquet('AutoRAG-example-korean-embedding-benchmark/data/ocr_corpus_v3.parquet')
# Lookup from the first corpus column to the second one
# (presumably doc_id -> contents; verify against the parquet schema).
# Columns are addressed by position with .iloc: integer keys on a
# string-labelled row (`row[0]`) are a deprecated positional fallback in pandas.
corpus_id = dict(zip(corpus_df.iloc[:, 0], corpus_df.iloc[:, 1]))


In [2]:
# Keep only the columns used by the evaluation.
qd_df = qd_df[['qid','query','generation_gt','retrieval_gt']]

# qid -> query text. Columns are addressed by name instead of `row[0]` /
# `row[1]`, which relied on pandas' deprecated positional fallback.
query_id = dict(zip(qd_df['qid'], qd_df['query']))

# qid -> set of relevant corpus ids (a qid that appears on several rows
# accumulates every id it is paired with).
qrel = qd_df[['qid','retrieval_gt']]
qrel_id = {}
for q_id, relevant_corpus_id in zip(qrel['qid'], qrel['retrieval_gt']):
    qrel_id.setdefault(q_id, set()).add(relevant_corpus_id)

corpus_df = corpus_df[['doc_id','contents']]

# Container handed to the metric functions; they read valid_dict['qrel'].
valid_dict = {'qrel': qrel_id}

  corpus_id[row[0]] =  row[1]
  query_id[row[0]] =  row[1]


In [3]:
# Display the first corpus row as a quick sanity check of the loaded data.
corpus_df.iloc[0]

doc_id                          commerce - B2BDigComm.pdf - 1
contents    Adobe\n디지털 커머스 시대,\nB2B 비즈니스 생존 전략\nB2B 비즈니스를 ...
Name: 0, dtype: object

In [4]:
# Display the column names kept in the query table.
qd_df.columns

Index(['qid', 'query', 'generation_gt', 'retrieval_gt'], dtype='object')

In [7]:
# Renumber both frames 0..n-1 (discarding the old index) so that row labels
# coincide with row positions; integer lookups such as `.loc[i, ...]` then
# address the i-th row.
corpus_df = corpus_df.reset_index(drop=True)
qd_df = qd_df.reset_index(drop=True)

In [None]:
def calculate_accuracy(ranks_list, valid_dict, qd_df, k_values=[1, 3, 5], corpus=None):
    """Compute the hit rate ("accuracy") at each cut-off k.

    A query counts as a hit at k when at least one of its relevant documents
    appears among its first k ranked corpus positions.

    Args:
        ranks_list: One sequence per query (same order as the rows of
            ``qd_df``) holding corpus row positions, best match first.
        valid_dict: Mapping whose ``'qrel'`` entry maps each qid to the
            collection of relevant ``doc_id`` values.
        qd_df: Query frame with a ``'qid'`` column.
        k_values: Cut-offs to evaluate. The list is only read, never modified.
        corpus: Frame with a ``'doc_id'`` column whose row order matches the
            positions stored in ``ranks_list``. When omitted, the
            notebook-level ``corpus_df`` is used.

    Returns:
        dict mapping each k to the fraction of queries with a hit in the top k
        (0.0 for every k when ``qd_df`` is empty).

    Raises:
        KeyError: If a qid has no ``'qrel'`` entry or a relevant doc_id is
            missing from the corpus.
        IndexError: If ``ranks_list`` has fewer entries than ``qd_df`` has rows.
    """
    if corpus is None:
        # Backward-compatible fallback to the notebook-level corpus frame.
        corpus = corpus_df

    total_queries = len(qd_df)
    if total_queries == 0:
        return {k: 0.0 for k in k_values}

    # doc_id -> row position, built once instead of scanning the corpus for
    # every query. Positions (not index labels) are used because the ranks
    # are positional; the first occurrence wins for duplicated doc_ids.
    doc_position = {}
    for position, doc_id in enumerate(corpus['doc_id']):
        doc_position.setdefault(doc_id, position)

    hits = {k: 0 for k in k_values}
    for i, qid in enumerate(qd_df['qid']):
        search_idx = ranks_list[i]
        # Use every relevant document rather than an arbitrary member of the set.
        relevant = {doc_position[doc_id] for doc_id in valid_dict['qrel'][qid]}

        for k in k_values:
            if any(position in relevant for position in search_idx[:k]):
                hits[k] += 1

    return {k: hits[k] / total_queries for k in k_values}

def calculate_f1_recall_precision(ranks_list, valid_dict, qd_df, k_values=[1, 3, 5], corpus=None):
    """Compute mean F1, recall and precision at each cut-off k.

    For every query and every k:
        precision = (# relevant docs in the top k) / (# docs in the top k)
        recall    = (# relevant docs in the top k) / (# relevant docs)
        f1        = harmonic mean of precision and recall (0 when there is no hit)
    The per-query values are averaged over all queries.

    Args:
        ranks_list: One sequence per query (same order as the rows of
            ``qd_df``) holding corpus row positions, best match first.
        valid_dict: Mapping whose ``'qrel'`` entry maps each qid to the
            collection of relevant ``doc_id`` values.
        qd_df: Query frame with a ``'qid'`` column.
        k_values: Cut-offs to evaluate. The list is only read, never modified.
        corpus: Frame with a ``'doc_id'`` column whose row order matches the
            positions stored in ``ranks_list``. When omitted, the
            notebook-level ``corpus_df`` is used.

    Returns:
        Tuple ``(f1, recall, precision)`` of dicts, each mapping k to the mean
        score over all queries (0.0 for every k when ``qd_df`` is empty).

    Raises:
        KeyError: If a qid has no ``'qrel'`` entry or a relevant doc_id is
            missing from the corpus.
        IndexError: If ``ranks_list`` has fewer entries than ``qd_df`` has rows.
    """
    if corpus is None:
        # Backward-compatible fallback to the notebook-level corpus frame.
        corpus = corpus_df

    total_queries = len(qd_df)
    if total_queries == 0:
        return ({k: 0.0 for k in k_values},
                {k: 0.0 for k in k_values},
                {k: 0.0 for k in k_values})

    # doc_id -> row position, built once; the first occurrence wins for
    # duplicated doc_ids. Positions are used because the ranks are positional.
    doc_position = {}
    for position, doc_id in enumerate(corpus['doc_id']):
        doc_position.setdefault(doc_id, position)

    f1_scores = {k: 0.0 for k in k_values}
    recall_scores = {k: 0.0 for k in k_values}
    precision_scores = {k: 0.0 for k in k_values}

    for i, qid in enumerate(qd_df['qid']):
        search_idx = ranks_list[i]
        relevant = {doc_position[doc_id] for doc_id in valid_dict['qrel'][qid]}

        for k in k_values:
            top_k_preds = search_idx[:k]
            n_retrieved = len(top_k_preds)
            n_hits = len(relevant.intersection(top_k_preds))

            # Computed directly from the counts: the retrieved-only label
            # vectors previously fed to scikit-learn could not see relevant
            # documents outside the top k, and triggered undefined-metric
            # warnings on every miss.
            precision = n_hits / n_retrieved if n_retrieved else 0.0
            recall = n_hits / len(relevant) if relevant else 0.0
            f1 = 2 * precision * recall / (precision + recall) if n_hits else 0.0

            precision_scores[k] += precision
            recall_scores[k] += recall
            f1_scores[k] += f1

    return {k: f1_scores[k] / total_queries for k in k_values}, \
           {k: recall_scores[k] / total_queries for k in k_values}, \
           {k: precision_scores[k] / total_queries for k in k_values}


def evaluate_model(corpus_df, qd_df, valid_dict, reranker, k_values=(1, 3, 5, 10)):
    """Score every query against every corpus document and compute ranking metrics.

    For each query, the reranker scores all (query, contents) pairs, the
    corpus positions are sorted by descending score, and the resulting
    rankings are passed to ``calculate_accuracy`` and
    ``calculate_f1_recall_precision``.

    Args:
        corpus_df: Frame with a ``'contents'`` column. It is only read; no
            column is added to the caller's frame.
        qd_df: Query frame with a ``'query'`` column.
        valid_dict: Ground-truth container forwarded to the metric functions.
        reranker: Object exposing ``compute_score(pairs, normalize=True)``.
        k_values: Cut-offs at which the metrics are computed.

    Returns:
        Tuple ``(accuracies, f1_scores, recalls, precisions)`` of dicts keyed by k.

    Note:
        The metric functions are called with their four-argument form, in
        which they look up ground-truth documents in the notebook-level
        ``corpus_df``; the frame passed here must therefore have the same row
        order as that frame.
    """
    # Loop-invariant: the document texts are the same for every query.
    contents = corpus_df['contents'].tolist()
    ranks_list = []

    for c, query in enumerate(qd_df['query'], start=1):
        pairs = [[query, doc] for doc in contents]
        # atleast_1d guards against a bare scalar being returned for a single pair.
        scores = np.atleast_1d(np.array(reranker.compute_score(pairs, normalize=True)))

        # Negate so that argsort yields positions from highest to lowest score.
        ranks_list.append(np.argsort(-scores))
        print(f'{c}/{len(qd_df)}')

    k_values = list(k_values)
    accuracies = calculate_accuracy(ranks_list, valid_dict, qd_df, k_values=k_values)
    f1_scores, recalls, precisions = calculate_f1_recall_precision(ranks_list, valid_dict, qd_df, k_values=k_values)

    return accuracies, f1_scores, recalls, precisions


# Model evaluation
reranker = FlagReranker(model_path, use_fp16=True)

# Pass a copy so the notebook-level corpus_df cannot be modified by the evaluation.
accuracies, f1_scores, recalls, precisions = evaluate_model(
    corpus_df=corpus_df.copy(),
    qd_df=qd_df,
    valid_dict=valid_dict,
    reranker=reranker,
)

# Report every metric at each cut-off, four decimals each.
print(f'Model: {model_path}')
for k in (1, 3, 5, 10):
    print(f'Accuracy@{k}: {accuracies[k]:.4f}')
    print(f'F1@{k}: {f1_scores[k]:.4f}')
    print(f'Recall@{k}: {recalls[k]:.4f}')
    print(f'Precision@{k}: {precisions[k]:.4f}')
