Spaces:

abdullahmeda
/

detect-ai-text

Running

App Files Files Community

abdullahmeda committed on Jan 24, 2024

Commit

d00901f

verified ·

1 Parent(s): bd733d0

initial commit

Browse files

Files changed (7) hide show

.gitattributes +3 -0
app.py +143 -0
data/english.pickle +3 -0
data/gpt2-large-model +3 -0
data/gpt2-medium-model +3 -0
data/gpt2-small-model +3 -0
requirements.txt +10 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/gpt2-large-model filter=lfs diff=lfs merge=lfs -text
+data/gpt2-medium-model filter=lfs diff=lfs merge=lfs -text
+data/gpt2-small-model filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import torch
+import joblib
+import numpy as np
+import pandas as pd
+import gradio as gr
+from nltk.data import load as nltk_load
+from transformers import AutoTokenizer, AutoModelForCausalLM
+NLTK = nltk_load('data/english.pickle')
+sent_cut_en = NLTK.tokenize
+clf = joblib.load(f'data/gpt2-large-model', 'rb')
+model_id  = 'gpt2-large'
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model     = AutoModelForCausalLM.from_pretrained(model_id)
+CROSS_ENTROPY = torch.nn.CrossEntropyLoss(reduction='none')
+def gpt2_features(text, tokenizer, model, sent_cut):
+    # Tokenize
+    input_max_length = tokenizer.model_max_length - 2
+    token_ids, offsets = list(), list()
+    sentences = sent_cut(text)
+    for s in sentences:
+        tokens = tokenizer.tokenize(s)
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        difference = len(token_ids) + len(ids) - input_max_length
+        if difference > 0:
+            ids = ids[:-difference]
+        offsets.append((len(token_ids), len(token_ids) + len(ids)))
+        token_ids.extend(ids)
+        if difference >= 0:
+            break
+    input_ids = torch.tensor([tokenizer.bos_token_id] + token_ids)
+    logits = model(input_ids).logits
+    # Shift so that n-1 predict n
+    shift_logits = logits[:-1].contiguous()
+    shift_target = input_ids[1:].contiguous()
+    loss = CROSS_ENTROPY(shift_logits, shift_target)
+    all_probs = torch.softmax(shift_logits, dim=-1)
+    sorted_ids = torch.argsort(all_probs, dim=-1, descending=True)  # stable=True
+    expanded_tokens = shift_target.unsqueeze(-1).expand_as(sorted_ids)
+    indices = torch.where(sorted_ids == expanded_tokens)
+    rank = indices[-1]
+    counter = [
+        rank < 10,
+        (rank >= 10) & (rank < 100),
+        (rank >= 100) & (rank < 1000),
+        rank >= 1000
+    ]
+    counter = [c.long().sum(-1).item() for c in counter]
+    # compute different-level ppl
+    text_ppl = loss.mean().exp().item()
+    sent_ppl = list()
+    for start, end in offsets:
+        nll = loss[start: end].sum() / (end - start)
+        sent_ppl.append(nll.exp().item())
+    max_sent_ppl = max(sent_ppl)
+    sent_ppl_avg = sum(sent_ppl) / len(sent_ppl)
+    if len(sent_ppl) > 1:
+        sent_ppl_std = torch.std(torch.tensor(sent_ppl)).item()
+    else:
+        sent_ppl_std = 0
+    mask = torch.tensor([1] * loss.size(0))
+    step_ppl = loss.cumsum(dim=-1).div(mask.cumsum(dim=-1)).exp()
+    max_step_ppl = step_ppl.max(dim=-1)[0].item()
+    step_ppl_avg = step_ppl.sum(dim=-1).div(loss.size(0)).item()
+    if step_ppl.size(0) > 1:
+        step_ppl_std = step_ppl.std().item()
+    else:
+        step_ppl_std = 0
+    ppls = [
+        text_ppl, max_sent_ppl, sent_ppl_avg, sent_ppl_std,
+        max_step_ppl, step_ppl_avg, step_ppl_std
+    ]
+    return ppls + counter  # type: ignore
+def predict(features, classifier, id_to_label):
+    x = np.asarray([features])
+    pred = classifier.predict(x)[0]
+    prob = classifier.predict_proba(x)[0, pred]
+    return [id_to_label[pred], prob]
+def predict(text):
+    with torch.no_grad():
+        feats = gpt2_features(text, tokenizer, model, sent_cut_en)
+    out = predict(*feats, clf, ['Human Written', 'LLM Generated'])
+    return out
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        ## ChatGPT Detector 🔬 (Linguistic version / 语言学版)
+        Visit our project on Github: [chatgpt-comparison-detection project](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)<br>
+        欢迎在 Github 上关注我们的 [ChatGPT 对比与检测项目](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection)<br>
+        We provide three kinds of detectors, all in Bilingual / 我们提供了三个版本的检测器，且都支持中英文:
+        - [QA version / 问答版](https://www.modelscope.cn/studios/simpleai/chatgpt-detector-qa)<br>
+            detect whether an **answer** is generated by ChatGPT for certain **question**, using PLM-based classifiers / 判断某个**问题的回答**是否由ChatGPT生成，使用基于PTM的分类器来开发;
+        - [Sinlge-text version / 独立文本版](https://www.modelscope.cn/studios/simpleai/chatgpt-detector-single)<br>
+            detect whether a piece of text is ChatGPT generated, using PLM-based classifiers / 判断**单条文本**是否由ChatGPT生成，使用基于PTM的分类器来开发;
+        - [**Linguistic version / 语言学版** (👈 Current / 当前使用)](https://www.modelscope.cn/studios/simpleai/chatgpt-detector-ling)<br>
+            detect whether a piece of text is ChatGPT generated, using linguistic features / 判断**单条文本**是否由ChatGPT生成，使用基于语言学特征的模型来开发;
+        """
+    )
+    gr.Markdown(
+        """
+        ## Introduction:
+        Two Logistic regression models trained with two kinds of features:
+        1. [GLTR](https://aclanthology.org/P19-3019) Test-2, Language model predict token rank top-k buckets, top 10, 10-100, 100-1000, 1000+.
+        2. PPL-based, text ppl, sentence ppl, etc.
+        English LM is [GPT2-small](https://huggingface.co/gpt2).
+        Note: Providing more text to the `Text` box can make the prediction more accurate!
+        """
+    )
+    a1 = gr.Textbox(
+        lines=5, label='Text',
+        value="There are a few things that can help protect your credit card information from being misused when you give it to a restaurant or any other business:\n\nEncryption: Many businesses use encryption to protect your credit card information when it is being transmitted or stored. This means that the information is transformed into a code that is difficult for anyone to read without the right key."
+    )
+    button1 = gr.Button("🤖 Predict!")
+    gr.Markdown("GLTR")
+    label1_gltr = gr.Textbox(lines=1, label='GLTR Predicted Label 🎃')
+    score1_gltr = gr.Textbox(lines=1, label='GLTR Probability')
+    button1.click(predict, inputs=[a1], outputs=[label1_gltr, score1_gltr])
+demo.launch()

data/english.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5cad3758596392364e3be9803dbd7ebeda384b68937b488a01365f5551bb942c
+size 406697

data/gpt2-large-model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f8a8c96268cfb0b109366d85a7d26f403aabcf5d411093596f670d384b1d7a9
+size 918235392

data/gpt2-medium-model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:744b547d5540386486a4b6642be3a60b33d3d3ad12db513f82031cee5ebea6c7
+size 932091008

data/gpt2-small-model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6aab895ebfc50e1168d70df9bb62837d14f8a78938943af1d1eaee280f427d15
+size 976393968

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+gradio==4.15.0
+joblib==1.3.2
+nltk==3.8.1
+numpy==1.26.3
+pandas==2.2.0
+torch==2.1.2
+transformers==4.37.0
+xgboost
+lightgbm
+catboost