import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch


def classify_by_topic(articles, central_topics):
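    """Assign each article to the central topic whose embedding it is most
    similar to, returning a list of (article, topic) tuples."""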

    # Compute the similarity between every article and every central topic; returns a matrix
    def compute_similarity(articles, central_topics):

        model = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
        tokenizer = AutoTokenizer.from_pretrained(
            "distilbert-base-multilingual-cased")

        # Convert a sentence, together with its surrounding context, into a vector
        def sentence_to_vector(sentence, context):
            # Build the input: the two preceding sentences, the target sentence repeated
            # four times to up-weight it relative to the context, then the two following sentences
            sentence = context[0]+context[1]+sentence*4+context[2]+context[3]
            # Tokenize, add special tokens, and truncate to the model's maximum length
            tokens = tokenizer.encode_plus(
                sentence, add_special_tokens=True, truncation=True,
                max_length=512, return_tensors="pt")
            # Run the model and take the per-token hidden states
            outputs = model(**tokens)
            hidden_states = outputs.last_hidden_state
            # Mean-pool the token hidden states to get the sentence vector
            vector = np.squeeze(torch.mean(
                hidden_states, dim=1).detach().numpy())  # shape (d,) after squeezing
            return vector

        # Get the two sentences before and the two sentences after the sentence at `index`
        def get_context(sentences, index):
            if index == 0:
                prev_sentence = ""
                pprev_sentence = ""
            elif index == 1:
                prev_sentence = sentences[index-1]
                pprev_sentence = ""
            else:
                prev_sentence = sentences[index-1]
                pprev_sentence = sentences[index-2]
            if index == len(sentences) - 1:
                next_sentence = ""
                nnext_sentence = ""
            elif index == len(sentences) - 2:
                next_sentence = sentences[index+1]
                nnext_sentence = ""
            else:
                next_sentence = sentences[index+1]
                nnext_sentence = sentences[index+2]
            return (pprev_sentence, prev_sentence, next_sentence, nnext_sentence)

        # Embed every article sentence and every central-topic sentence
        doc_vectors = [sentence_to_vector(sentence, get_context(
            articles, i)) for i, sentence in enumerate(articles)]
        topic_vectors = [sentence_to_vector(sentence, get_context(
            central_topics, i)) for i, sentence in enumerate(central_topics)]
        # Cosine-similarity matrix between article vectors and topic vectors
        cos_sim_matrix = cosine_similarity(doc_vectors, topic_vectors)

        return cos_sim_matrix

    # Group articles by their best-matching topic; returns a list of (article, topic) pairs
    def group_by_topic(articles, central_topics, similarity_matrix):
        group = []
        original_articles = articles.copy()  # keep a copy of the original article list
        # iterate over the original articles rather than any preprocessed versions
        for article, similarity in zip(original_articles, similarity_matrix):
            max_similarity = max(similarity)  # highest similarity score
            # index of the topic with the highest similarity
            max_index = similarity.tolist().index(max_similarity)
            group.append((article, central_topics[max_index]))

        return group

    # Run the classification
    similarity_matrix = compute_similarity(articles, central_topics)
    groups = group_by_topic(articles, central_topics, similarity_matrix)

    # Return the grouped (article, topic) list
    return groups
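

# ---------------------------------------------------------------------------
# Minimal usage sketch (the sentences below are hypothetical placeholders,
# not data from the original project; running this downloads the
# distilbert-base-multilingual-cased weights on first use):
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    articles = [
        "The central bank raised interest rates again this quarter.",
        "The new midfielder scored twice in the season opener.",
    ]
    central_topics = [
        "Economy and finance",
        "Sports",
    ]
    # Each article is paired with the central topic it is most similar to.
    for article, topic in classify_by_topic(articles, central_topics):
        print(f"{topic}: {article}")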