File size: 3,049 Bytes
5795073
 
 
 
 
 
 
 
 
 
 
2fbd2b9
0cd5e33
2fbd2b9
 
5795073
 
 
 
 
 
d332496
2fbd2b9
5795073
 
 
 
 
 
 
 
2fbd2b9
5795073
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fbd2b9
 
 
 
 
 
 
 
5795073
2fbd2b9
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import torch
import os
import numpy as np
import gradio as gr
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from datasets.dataset_dict import DatasetDict
from transformers import AdamW, T5ForConditionalGeneration, T5TokenizerFast
from tqdm.auto import tqdm
from summarizer import SummarizerModel
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
import warnings
warnings.simplefilter('ignore')

# Hugging Face checkpoint name used for both the tokenizer and the
# summarizer wrapper.
MODEL_NAME = 'Salesforce/codet5-base-multi-sum'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = SummarizerModel(MODEL_NAME)
# map_location='cpu' lets a checkpoint that was saved from a GPU be
# deserialized on a host without CUDA; load_state_dict then copies the
# values into the parameters that already exist on the model.
model.load_state_dict(
    torch.load('codet5-base-1_epoch-val_loss-0.80.pth', map_location='cpu')
)
# The model is only used for inference in this script, so put it in
# evaluation mode (presumably SummarizerModel is an nn.Module, as it
# exposes load_state_dict).
model.eval()
# Sentence-embedding model used to compare the generated summaries.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def summarize(text: str,
              tokenizer = tokenizer,
              trained_model = model) -> str:
    """
    Generate a natural-language summary for a piece of source code.

    Args:
        text: The code, as a string, that should be summarized.
        tokenizer: Tokenizer used to encode `text` and to decode the
            generated ids. Defaults to the module-level tokenizer.
        trained_model: Object exposing the generation model as its
            `.model` attribute. Defaults to the module-level
            SummarizerModel instance.

    Returns:
        The decoded generated sequence(s), concatenated into one string.
    """
    # Encode the input as PyTorch tensors, padded/truncated to 512 tokens.
    encoded = tokenizer.encode_plus(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Beam-search settings for generation (at most 150 tokens).
    generation_options = {
        'max_length': 150,
        'num_beams': 2,
        'repetition_penalty': 2.5,
        'length_penalty': 1.0,
        'early_stopping': True,
    }
    output_ids = trained_model.model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        **generation_options,
    )

    # Decode every returned sequence and glue the pieces together.
    pieces = []
    for sequence in output_ids:
        pieces.append(
            tokenizer.decode(
                sequence,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        )
    return "".join(pieces)

def find_similarity_score(code_1, code_2, model = embedding_model):
    """
    Summarize two code snippets and score how similar the summaries are.

    Args:
        code_1: First code snippet, passed to `summarize`.
        code_2: Second code snippet, passed to `summarize`.
        model: Object whose `encode` method turns a summary into an
            embedding vector. Defaults to the module-level
            `embedding_model`.

    Returns:
        A tuple `(summary_1, summary_2, score)` where `score` is the
        cosine similarity of the two summary embeddings, rounded to two
        decimal places.
    """
    first_summary, second_summary = [
        summarize(text = snippet) for snippet in (code_1, code_2)
    ]
    first_vec, second_vec = [
        model.encode(summary) for summary in (first_summary, second_summary)
    ]

    # Cosine similarity: dot product divided by the product of the norms.
    dot_product = np.dot(first_vec, second_vec)
    norm_product = np.linalg.norm(first_vec) * np.linalg.norm(second_vec)
    cosine = dot_product / norm_product
    return first_summary, second_summary, round(cosine, 2)

# Gradio UI: two code snippets go in; find_similarity_score returns three
# values, shown in three text boxes (both summaries and the score).
# Components are created through the top-level `gr.Textbox` API only; the
# legacy `gr.outputs` namespace is deprecated and missing from newer
# Gradio releases, so it is not used here.
iface = gr.Interface(fn=find_similarity_score,
                     inputs=[gr.Textbox(label = 'First Code snippet'),
                             gr.Textbox(label = 'Second Code snippet')],
                     outputs=[gr.Textbox(label = 'Summary of first Code snippet'),
                              gr.Textbox(label = 'Summary of second Code snippet'),
                              gr.Textbox(label = 'The similarity score')],
                     description='The similarity score')
# Start the web server for the interface.
iface.launch()