Spaces:
Running
Running
import streamlit as st | |
from streamlit_option_menu import option_menu | |
import numpy as np | |
import os | |
import datasets | |
import argparse | |
from typing import Tuple | |
import transformers | |
import torch | |
from torch.utils.data import Dataset | |
import matplotlib as plt | |
import random | |
from tqdm import tqdm | |
import pandas as pd | |
from huggingface_hub import login | |
from torch.optim import lr_scheduler | |
from typing import Callable, Dict, List, Tuple, Union | |
import csv | |
from timeit import default_timer as timer | |
def load_tokenizer(tokenizer_name: str) -> object:
    """
    Load a pretrained tokenizer by name.
    Args:
        - tokenizer_name -> the name of the tokenizer to download
    Returns:
        - tokenizer -> the object returned by transformers.AutoTokenizer.from_pretrained
    """
    # Use the name supplied by the caller; the previous implementation ignored
    # the argument and always loaded a hard-coded checkpoint.
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
    return tokenizer
def load_model(model_name: str) -> object:
    """
    Load the encoder part of a pretrained T5-style checkpoint in bfloat16.
    Args:
        - model_name -> the name of the model
    Returns:
        - model_encoder -> the loaded transformers.T5EncoderModel (the tokenizer
          is NOT returned by this function)
    """
    print(f'Loading model {model_name}...')
    model_kwargs = {"torch_dtype": torch.bfloat16}
    # Only the encoder is instantiated, so decoder weights stored in the
    # checkpoint are unexpected keys; presumably this silences the resulting
    # load-time warnings. NOTE: this mutates a class-level attribute of
    # T5EncoderModel, so it affects every later load in the process.
    transformers.T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
    # Load the checkpoint the caller asked for (previously hard-coded).
    model_encoder = transformers.T5EncoderModel.from_pretrained(model_name, **model_kwargs)
    print("---MODEL LOADED---")
    return model_encoder
class stylometer_classifier(torch.nn.Module):
    """
    Encoder followed by a two-layer classification head that labels an input
    as human-written or LLM-written.

    Args:
        - pretrained_encoder -> callable accepting ``input_ids`` and
          ``attention_mask`` keyword arguments; element 0 of its result is used
          as the hidden states (indexed as ``[:, 0]``)
        - dimensionality -> size of the last dimension of the hidden states
        - threshold -> scores greater than or equal to this value are labelled
          as human (default 0.07, the value previously hard-coded)
    """

    HUMAN_LABEL = "It's a Human!"
    LLM_LABEL = "It's an LLM!"

    def __init__(self, pretrained_encoder, dimensionality, threshold=0.07):
        super().__init__()
        self.modelBase = pretrained_encoder
        self.pre_classifier = torch.nn.Linear(dimensionality, 768, dtype=torch.bfloat16)
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1, dtype=torch.bfloat16)
        # Plain float attribute: not a parameter/buffer, so it does not appear
        # in the state dict and existing checkpoints keep loading unchanged.
        self.threshold = threshold

    def forward(self, input_ids, padding_mask):
        """
        Score one example and turn the score into a label.

        Args:
            - input_ids -> token ids passed to the encoder
            - padding_mask -> attention mask passed to the encoder
        Returns:
            - dict with "my_class" (one of the two label strings) and "prob"
              (the sigmoid output tensor of the head)
        Note:
            The score tensor is truth-tested directly, so only a single example
            per call is supported; a tensor with more than one element makes
            the comparison raise.
        """
        encoder_output = self.modelBase(input_ids=input_ids, attention_mask=padding_mask)
        hidden_state = encoder_output[0]
        # Only the first token's representation is used for classification.
        cls_output = hidden_state[:, 0]
        pooled = self.pre_classifier(cls_output)
        pooled = self.dropout(self.activation(pooled))
        output = torch.sigmoid(self.classifier(pooled))
        label = self.HUMAN_LABEL if output >= self.threshold else self.LLM_LABEL
        return {"my_class": label, "prob": output}
def adapt_model(model: object, dim: int = 1024) -> object:
    """
    Wrap an encoder in a stylometer_classifier so that it gains a
    classification head.
    Args:
        - model -> the encoder to wrap
        - dim -> forwarded to the classifier as its ``dimensionality`` argument
    Returns:
        - the newly built stylometer_classifier instance
    """
    return stylometer_classifier(model, dimensionality=dim)
def _load_classifier(model_name, checkpoint, device):
    """Build the tokenizer and the fine-tuned classifier (in eval mode) from a checkpoint."""
    tokenizer = load_tokenizer(model_name)
    print("tokenizer loaded!")
    encoder = load_model(model_name)
    # Add the classification head on top of the encoder.
    model = adapt_model(encoder, dim=encoder.shared.embedding_dim)
    # NOTE: torch.load unpickles the file; only use checkpoints from a trusted source.
    model.load_state_dict(torch.load(checkpoint, map_location=device))
    return tokenizer, model.eval()


def main():
    """
    Streamlit entry point: load the classifier, render the page, and show the
    predicted label for the code pasted by the user.
    """
    print("----starting enviroment----")
    model_name = "Salesforce/codet5p-770m"
    checkpoint = "checkpoint.bin"
    DEVICE = "cpu"
    # Streamlit re-executes the script on every interaction; caching the loader
    # avoids re-reading the model and checkpoint on each rerun. The wrapper is
    # created here (not as a module-level decorator) so that defining the
    # helper does not touch streamlit at import time.
    cached_loader = st.cache_resource(_load_classifier)
    tokenizer, model = cached_loader(model_name, checkpoint, DEVICE)
    st.title("Human-AI stylometer - Multilingual")
    st.caption('From the paper: Is This You, LLM? Recognizing AI-written Programs with Multilingual Code Stylometry')
    text = st.text_area("insert your code here")
    button = st.button("send")
    if not (button or text):
        return
    if not text.strip():
        # Pressing "send" with nothing to analyse used to classify an empty string.
        st.warning("Please insert some code before sending.")
        return
    encoded = tokenizer([text], return_tensors="pt")
    # Pure inference: no gradients are needed.
    with torch.no_grad():
        out = model(encoded.input_ids, encoded.attention_mask)
    st.write(out["my_class"])
# Run the app only when this file is executed as a script.
if __name__ == "__main__":
    main()