add data files
Browse files- Dockerfile +14 -0
- bi_encoder.py +54 -0
- corssencode_inference.py +57 -0
- data/BBT_sheldon_all.csv +0 -0
- data/corpus.pkl +3 -0
- data/dataset.pkl +3 -0
- data/sentences.pkl +3 -0
- main.py +35 -0
- model.py +16 -0
- requirements.txt +0 -0
- templates/index.html +36 -0
Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.9

# Spaces run the container as UID 1000. Create that user up front so the
# process has a real, writable home directory (the Hugging Face model cache
# is written under $HOME when the app downloads its checkpoints at import).
RUN useradd -m -u 1000 user

WORKDIR /code

# Install dependencies first so this layer is cached across source changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY --chown=user . .

# Drop root for runtime; packages above were installed system-wide as root.
USER user
ENV HOME=/home/user

CMD ["gunicorn", "-b", "0.0.0.0:7860", "main:app"]
|
bi_encoder.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import pickle
|
3 |
+
import torch
|
4 |
+
from transformers import AutoTokenizer, AutoModel
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
|
7 |
+
# Single source of truth for the checkpoint backing both bi-encoder objects.
_BI_ENCODER_CHECKPOINT = "distilbert-base-uncased"

# Module-level tokenizer and encoder shared by every query embedding call.
tokenizer = AutoTokenizer.from_pretrained(_BI_ENCODER_CHECKPOINT)
bert_model = AutoModel.from_pretrained(_BI_ENCODER_CHECKPOINT)
|
9 |
+
|
10 |
+
|
11 |
+
def mean_pool(token_embeds: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings along dimension 1, ignoring masked positions.

    Args:
        token_embeds: Per-token embeddings; the reduction runs over dimension 1.
        attention_mask: Mask with one entry per token (non-zero keeps the
            token). It is unsqueezed on the last axis and expanded to
            ``token_embeds.size()`` before being applied.

    Returns:
        The mask-weighted mean of ``token_embeds`` over dimension 1. A row
        whose mask is entirely zero yields zeros, because the divisor is
        clamped to ``1e-9`` rather than being allowed to reach zero.
    """
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    masked_sum = torch.sum(token_embeds * in_mask, dim=1)
    # Clamp so an all-zero mask divides by a tiny number instead of by zero.
    token_count = torch.clamp(in_mask.sum(dim=1), min=1e-9)
    return masked_sum / token_count
|
15 |
+
|
16 |
+
|
17 |
+
def encode(input_texts: list[str], tokenizer: AutoTokenizer, model: AutoModel, device: str = "cpu"
           ) -> torch.Tensor:
    """Tokenize ``input_texts`` and return their mean-pooled encoder embeddings.

    The model is switched to eval mode and the forward pass runs under
    ``torch.no_grad()``: this function is used for inference only, so no
    autograd graph needs to be kept alive for the returned tensor.

    Args:
        input_texts: Texts to embed. They are handed to ``tokenizer`` as-is
            (``get_question`` in this module passes a single string).
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        model: Encoder whose output exposes ``last_hidden_state``.
        device: Device the tokenized inputs are moved to. The model itself is
            not moved here, so it is expected to already live on ``device``.

    Returns:
        The pooled embeddings produced by ``mean_pool``.
    """
    model.eval()
    tokenized_texts = tokenizer(input_texts, max_length=512,
                                padding='max_length', truncation=True, return_tensors="pt")
    # Move each tensor to the target device once and reuse it below.
    input_ids = tokenized_texts["input_ids"].to(device)
    attention_mask = tokenized_texts["attention_mask"].to(device)
    with torch.no_grad():
        token_embeds = model(input_ids, attention_mask).last_hidden_state
        pooled_embeds = mean_pool(token_embeds, attention_mask)
    return pooled_embeds
|
27 |
+
|
28 |
+
|
29 |
+
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files shipped
    # with the application, never user-supplied ones.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


sentences = _load_pickle('data/sentences.pkl')
corpus = _load_pickle('data/corpus.pkl')

# Lookup table used by get_corpus: built from ``sentences`` with the answer
# texts attached as the 'corpus' column.
# NOTE(review): get_corpus reads an 'embeds' column, so ``sentences`` is
# presumably expected to provide it -- confirm against the pickle contents.
df = pd.DataFrame.from_dict(sentences)
df['corpus'] = corpus
|
37 |
+
|
38 |
+
|
39 |
+
def get_question(context: str, question: str):
    """Embed a context/question pair with the module-level bi-encoder.

    The two strings are joined as ``"{context} [Cont_token] {question}"`` and
    passed to ``encode`` together with the module-level ``tokenizer`` and
    ``bert_model`` on the CPU.

    Returns:
        The pooled embedding, detached and converted to a NumPy array.
    """
    query_text = f"{context} [Cont_token] {question}"
    embedding = encode(query_text, tokenizer, bert_model, "cpu")
    return embedding.cpu().detach().numpy()
|
44 |
+
|
45 |
+
|
46 |
+
def cosine_sim(question, embed):
    """Return entry ``[0][0]`` of the pairwise cosine-similarity matrix.

    Both arguments are forwarded unchanged to
    ``sklearn.metrics.pairwise.cosine_similarity``; only the similarity of the
    first row of ``question`` with the first row of ``embed`` is returned.
    """
    similarity_matrix = cosine_similarity(question, embed)
    first_row = similarity_matrix[0]
    return first_row[0]
|
48 |
+
|
49 |
+
|
50 |
+
def get_corpus(context: str, question: str, top_k: int = 10):
    """Return the ``top_k`` corpus entries closest to the query embedding.

    The query is embedded with ``get_question`` and compared, row by row, with
    the ``'embeds'`` column of the module-level ``df`` using ``cosine_sim``.

    Args:
        context: Dialogue context preceding the question.
        question: The question itself.
        top_k: Number of candidates to return (defaults to the original 10).

    Returns:
        A list with the ``'corpus'`` values of the best-scoring rows, ordered
        from most to least similar.
    """
    question_embed = get_question(context, question)
    scores = df['embeds'].apply(lambda embed: cosine_sim(question_embed, embed))
    # Rank on a small per-call frame instead of writing a scratch column onto
    # the shared module-level DataFrame, so concurrent requests cannot
    # overwrite each other's scores.
    ranked = df[['corpus']].assign(cosine_similarity=scores.to_numpy())
    best = ranked.sort_values(by=['cosine_similarity'], ascending=False).head(top_k)
    return best['corpus'].tolist()
|
corssencode_inference.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Tuple, Any
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
from transformers import AutoTokenizer
|
5 |
+
from bi_encoder import get_corpus, get_question
|
6 |
+
|
7 |
+
import torch
|
8 |
+
|
9 |
+
from model import CrossEncoderBert
|
10 |
+
|
11 |
+
|
12 |
+
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = CrossEncoderBert()
# CrossEncoderBert adds "[Cont_token]" to its tokenizer, so grow the embedding
# matrix to the tokenizer's size before loading the saved weights into it.
model.model.resize_token_embeddings(len(model.tokenizer))
# NOTE: torch.load unpickles the file; only load checkpoints shipped with the app.
model.load_state_dict(torch.load('model/torch_model', map_location=torch.device(device)))
model.tokenizer = AutoTokenizer.from_pretrained('model/tokenizer')
model.to(device)
# This module only serves predictions: put the wrapper and all of its
# submodules in inference mode explicitly instead of relying on defaults.
model.eval()
|
19 |
+
|
20 |
+
|
21 |
+
def get_range_answers(
        context: str,
        question: str,
        num_answers: int = 5) -> list[str]:
    """Re-rank bi-encoder candidates with the cross-encoder.

    Candidates come from ``get_corpus``; each one is paired with
    ``"{context} [Cont_token] {question}"``, scored by the module-level
    ``model`` and passed through a sigmoid.

    Args:
        context: Dialogue context preceding the question.
        question: The question itself.
        num_answers: Maximum number of answers to return.

    Returns:
        Up to ``num_answers`` candidate texts as strings, best score first.
        An empty list when there are no candidates or ``num_answers`` is not
        positive.
    """
    if num_answers <= 0:
        return []

    corpus = get_corpus(context, question)
    if not corpus:
        # Nothing to re-rank.
        return []

    context_question = f'{context} [Cont_token] {question}'
    tokenized_texts = model.tokenizer(
        [context_question] * len(corpus),
        corpus,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        ce_scores = model(tokenized_texts['input_ids'],
                          tokenized_texts['attention_mask']).squeeze(-1)
        ce_scores = torch.sigmoid(ce_scores)

    # argsort is ascending; reverse it so the highest score comes first.
    scores_ix = np.argsort(ce_scores.cpu().numpy())[::-1]
    return [str(corpus[idx]) for idx in scores_ix[:num_answers]]
|
51 |
+
|
52 |
+
|
53 |
+
def get_best_answer(
        context: str,
        question: str
) -> str:
    """Return the single best-ranked answer for a context/question pair.

    Returns:
        The top answer from ``get_range_answers``, or an empty string when it
        yields no candidates.
    """
    answers = get_range_answers(context, question, 1)
    # get_range_answers returns plain strings, so the best answer is element 0;
    # indexing one level deeper would return just a single character of it.
    return answers[0] if answers else ""
|
data/BBT_sheldon_all.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/corpus.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6dc57f8da9666e2889503c73a6ab21d85f38a9fcd1650a289468ca2a06841c8
|
3 |
+
size 1070772
|
data/dataset.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef2ff6bc63bafa936eb743eb3bbfdd9ebd3192e8ba9e1bbe212cf53093a478a7
|
3 |
+
size 3360049
|
data/sentences.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9fb9226181cc8dacaac4ea03baf363be24a5df81d1f8ce70fa85b7b71016c4ef
|
3 |
+
size 37335519
|
main.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, render_template, request, jsonify
|
2 |
+
from corssencode_inference import get_range_answers, get_best_answer
|
3 |
+
|
4 |
+
app = Flask(__name__)
|
5 |
+
|
6 |
+
|
7 |
+
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the chat form and answer submitted questions as JSON.

    Non-POST requests render ``index.html``. A POST reads the ``context`` and
    ``question`` form fields (missing fields are treated as empty strings) and
    dispatches on the submit button that was pressed:

    * ``get_answer == 'One answer'`` -> single best answer
    * ``get_answer_corpus == 'Five answer'`` -> ranked list of answers

    A POST without a recognised button gets a 400 JSON error instead of
    falling through without a response.
    """
    if request.method != 'POST':
        return render_template('index.html')

    context = request.form.get('context', '')
    question = request.form.get('question', '')

    if request.form.get('get_answer') == 'One answer':
        answer = get_best_answer(context, question)
    elif request.form.get('get_answer_corpus') == 'Five answer':
        answer = get_range_answers(context, question)
    else:
        return jsonify(
            {
                "response_code": "400",
                "error": "Unknown action: submit with 'One answer' or 'Five answer'."
            }
        ), 400

    return jsonify(
        {
            "response_code": "200",
            "request": f"{context} [Cont_token] {question}",
            "response": answer
        }
    )
|
32 |
+
|
33 |
+
|
34 |
+
if __name__ == '__main__':
    # Local development entry point; the Dockerfile starts the app through
    # gunicorn instead, so this branch is not used in the container.
    app.run(host='localhost', port=5000)
|
model.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModel
|
2 |
+
import torch
|
3 |
+
|
4 |
+
|
5 |
+
class CrossEncoderBert(torch.nn.Module):
    """DistilBERT encoder with a one-unit linear head on the first token.

    The tokenizer is extended with the special token ``[Cont_token]``. The
    encoder's embedding matrix is NOT resized here, so callers must call
    ``resize_token_embeddings`` themselves before using the extra token.
    """

    def __init__(self):
        super().__init__()
        checkpoint = 'distilbert-base-uncased'
        self.model = AutoModel.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.tokenizer.add_tokens(["[Cont_token]"], special_tokens=True)
        # Scoring head: maps one hidden vector to a single raw score.
        hidden_dim = self.model.config.hidden_size
        self.linear = torch.nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the first token of each sequence."""
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.linear(first_token_state)
|
requirements.txt
ADDED
Binary file (5.45 kB). View file
|
|
templates/index.html
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
<head>
|
4 |
+
<meta charset="UTF-8">
|
5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6 |
+
<title>Chat with Sheldon</title>
|
7 |
+
</head>
|
8 |
+
<body>
|
9 |
+
<h2 style="font-style:italic; text-align:center">Chat bot Sheldon</h2>
|
10 |
+
|
11 |
+
<p style="text-align:center">Homework 1</p>
|
12 |
+
|
13 |
+
<p style="text-align:center">by Petrov DE</p>
|
14 |
+
|
15 |
+
<hr />
|
16 |
+
<div>
|
17 |
+
<p style="text-align:center">Answer: {{ answer }}</p>
|
18 |
+
</div>
|
19 |
+
<form method="post" action="/" style="text-align:center">
|
20 |
+
<label>
|
21 |
+
Context:
|
22 |
+
</label>
|
23 |
+
<label>
|
24 |
+
<input type="text" name="context"/>
|
25 |
+
</label>
|
26 |
+
<label>
|
27 |
+
Question:
|
28 |
+
</label>
|
29 |
+
<label>
|
30 |
+
<input type="text" name="question"/>
|
31 |
+
</label>
|
32 |
+
<input type="submit" value="One answer" name="get_answer"/>
|
33 |
+
<input type="submit" value="Five answer" name="get_answer_corpus" />
|
34 |
+
</form>
|
35 |
+
</body>
|
36 |
+
</html>
|