"""Gradio app that scores a PDF's text as AI-generated vs. human-written.

Loads the PirateXX/AI-Content-Detector RoBERTa classifier, splits the
extracted PDF text into <=900-character sentence chunks, scores each chunk,
and returns a length-weighted average "Real" (human-written) probability.
"""
from flask import Flask, request
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch import cuda
import torch
import gradio as gr
import os
import re
import pdfplumber

app = Flask(__name__)

# Hugging Face access token for the model repo; required at startup
# (KeyError here is intentional: the app cannot run without it).
ACCESS_TOKEN = os.environ["ACCESS_TOKEN"]

# config = RobertaConfig.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN)
# model = RobertaForSequenceClassification.from_pretrained("PirateXX/ChatGPT-Text-Detector", use_auth_token= ACCESS_TOKEN, config = config)
# model_name = "roberta-base"
# tokenizer = RobertaTokenizer.from_pretrained(model_name, map_location=torch.device('cpu'))

tokenizer = AutoTokenizer.from_pretrained("PirateXX/AI-Content-Detector", use_auth_token=ACCESS_TOKEN)
model = AutoModelForSequenceClassification.from_pretrained("PirateXX/AI-Content-Detector", use_auth_token=ACCESS_TOKEN)


def text_to_sentences(text):
    """Split *text* into sentences at '.'/'?' boundaries followed by a capital."""
    return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', text)


def chunks_of_900(text, chunk_size=900):
    """Greedily pack sentences into chunks of at most *chunk_size* characters.

    A single sentence longer than *chunk_size* becomes its own (oversized)
    chunk; `predict` truncates to the model's max length anyway.
    """
    sentences = text_to_sentences(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk + sentence) <= chunk_size:
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk += sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    chunks.append(current_chunk)
    return chunks


def predict(query, device="cpu"):
    """Return the model's probability that *query* is human-written ("real").

    The input is truncated to the tokenizer's maximum sequence length,
    leaving room for the BOS/EOS tokens added here.
    """
    tokens = tokenizer.encode(query)
    tokens = tokens[:tokenizer.model_max_length - 2]
    tokens = torch.tensor([tokenizer.bos_token_id] + tokens + [tokenizer.eos_token_id]).unsqueeze(0)
    mask = torch.ones_like(tokens)
    with torch.no_grad():
        logits = model(tokens.to(device), attention_mask=mask.to(device))[0]
        probs = logits.softmax(dim=-1)
        # Class order assumed by the original author: index 0 = fake (AI),
        # index 1 = real (human) — TODO confirm against the model card.
        fake, real = probs.detach().cpu().flatten().numpy().tolist()
    return real


def findRealProb(text):
    """Score *text* chunk-by-chunk and return length-weighted probabilities.

    Returns {"Real", "Fake", "results", "text"} on success, or
    {"error": ...} when *text* contains nothing scorable (previously this
    raised ZeroDivisionError on empty input).
    """
    results = []
    for chunk in chunks_of_900(text):
        # Skip empty chunks (e.g. from empty input) — a zero-length chunk
        # contributes nothing and would leave the total weight at zero.
        if not chunk:
            continue
        results.append([predict(chunk), len(chunk)])
    weighted_sum = 0
    total_length = 0
    for prob, length in results:
        weighted_sum += prob * length
        total_length += length
    if total_length == 0:
        return {"error": "No text to analyse"}
    realProb = weighted_sum / total_length
    return {"Real": realProb, "Fake": 1 - realProb, "results": results, "text": text}


def upload_file(file):
    """Extract text from an uploaded PDF (first ~6 pages) and score it."""
    if not file:
        return {"error": 'No PDF file found in request'}
    text = ""
    with pdfplumber.open(file.name) as pdf:
        cnt = 0
        for page in pdf.pages:
            cnt += 1
            # extract_text() can return None for image-only pages;
            # coalesce to "" so the concatenation never raises TypeError.
            text += page.extract_text(x_tolerance=1) or ""
            # Bound latency: stop after the break condition first trips
            # (matches the original off-by-style loop: up to 6 pages read).
            if cnt > 5:
                break
    text = text.replace('\n', ' ')
    return findRealProb(text)


demo = gr.Interface(
    fn=upload_file,
    inputs=gr.File(),
    article="Visit AI Content Detector for better user experience!",
    # gr.outputs.JSON is deprecated (removed in Gradio 4); gr.JSON is the
    # supported component in both Gradio 3.x and 4.x.
    outputs=gr.JSON(),
    interpretation="default",
)
demo.launch(show_api=False)