Spaces:
Running
Running
Felix Marty
committed on
Commit
·
35e3254
1
Parent(s):
586c827
add demo
Browse files- Dockerfile +30 -0
- app.py +81 -4
- backend.py +101 -0
- defaults.py +38 -0
- requirements.txt +1 -0
- utils.py +25 -0
Dockerfile
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# CUDA 11.7 / cuDNN 8 development image used as the base for the demo.
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04

# Put the Miniconda binaries first on PATH for later build steps and at runtime.
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"

# Refresh the package index in the same layer as the installs so a cached
# `apt-get update` layer is never paired with a newer install command.
# ca-certificates is listed explicitly because --no-install-recommends would
# otherwise drop it, and both wget and git fetch over HTTPS below.
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends ca-certificates git wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Miniconda non-interactively (-b) and remove the installer afterwards.
RUN wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh

RUN conda init bash

RUN pip install torch torchvision torchaudio

RUN pip install torchserve torch-model-archiver torch-workflow-archiver

RUN pip install transformers optimum

# Clone inside /workspace so that the final WORKDIR is the actual checkout
# rather than a freshly created empty directory.
WORKDIR /workspace
RUN git clone https://github.com/fxmarty/bettertransformer_demo.git

WORKDIR /workspace/bettertransformer_demo
|
app.py
CHANGED
@@ -1,7 +1,84 @@
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
|
4 |
-
|
|
|
5 |
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
+
from .defaults import defaults_vanilla_single, defaults_bt_spam, defaults_bt_single, defaults_vanilla_spam
|
4 |
+
from .defaults import ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER
|
5 |
+
from .backend import send_single, send_spam, get_message_single, get_message_spam
|
6 |
|
7 |
+
# Side-by-side demo UI: one column per TorchServe endpoint (vanilla
# Transformers vs. BetterTransformer). Each column has a single-request button
# and a "spam" button that sends many requests; results render as Markdown.
with gr.Blocks() as demo:
    # TODO: replace the repeated placeholder sentence with a real description.
    gr.Markdown("""
    Let's try out TorchServe + BetterTransformer! This is some longer description This is some longer description This is some longer description

    ## Inference using...
    """
    )

    with gr.Row():
        with gr.Column(scale=50):
            gr.Markdown("### Vanilla Transformers + TorchServe")

            # Hidden textbox whose only job is to pass the endpoint URL to the
            # click handlers as a regular Gradio input.
            address_input_vanilla = gr.Textbox(
                max_lines=1,
                label="ip vanilla",
                value=ADDRESS_VANILLA,
                visible=False
            )

            input_model_vanilla = gr.Textbox(
                max_lines=1,
                label="Text",
                value="Expectations were low, enjoyment was high",
            )

            # The output panels start with canned statistics from defaults.py
            # so the page is not empty before the first request.
            btn_single_vanilla = gr.Button("Send single text request")
            output_single_vanilla = gr.Markdown(label="Output single vanilla", value=get_message_single(**defaults_vanilla_single))

            btn_spam_vanilla = gr.Button("Spam text requests (from sst2 validation set)")
            output_spam_vanilla = gr.Markdown(label="Output spam vanilla", value=get_message_spam(**defaults_vanilla_spam))

            btn_single_vanilla.click(
                fn=send_single,
                inputs=[input_model_vanilla, address_input_vanilla],
                outputs=output_single_vanilla,
            )
            btn_spam_vanilla.click(
                fn=send_spam,
                inputs=[address_input_vanilla],
                outputs=output_spam_vanilla,
            )

        with gr.Column(scale=50):
            gr.Markdown("### BetterTransformer + TorchServe")

            # Hidden endpoint URL for the BetterTransformer server.
            address_input_bettertransformer = gr.Textbox(
                max_lines=1,
                label="ip bettertransformer",
                value=ADDRESS_BETTERTRANSFORMER,
                visible=False
            )

            input_model_bettertransformer = gr.Textbox(
                max_lines=1,
                label="Text",
                value="Expectations were low, enjoyment was high",
            )

            btn_single_bt = gr.Button("Send single text request")
            output_single_bt = gr.Markdown(label="Output single bt", value=get_message_single(**defaults_bt_single))

            btn_spam_bt = gr.Button("Spam text requests (from sst2 validation set)")
            output_spam_bt = gr.Markdown(label="Output spam bt", value=get_message_spam(**defaults_bt_spam))

            btn_single_bt.click(
                fn=send_single,
                inputs=[input_model_bettertransformer, address_input_bettertransformer],
                outputs=output_single_bt,
            )

            btn_spam_bt.click(
                fn=send_spam,
                inputs=[address_input_bettertransformer],
                outputs=output_spam_bt,
            )

# Queue events with a concurrency count of 1 before launching the app.
demo.queue(concurrency_count=1)
demo.launch()
|
backend.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
from .defaults import SPAM_N_REQUESTS, ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER, HEADERS
|
4 |
+
from .utils import ElapsedFuturesSession
|
5 |
+
|
6 |
+
from datasets import load_dataset
|
7 |
+
|
8 |
+
# SST-2 validation split of GLUE, loaded once at import time; send_spam()
# samples rows from it and posts their "sentence" field.
data = load_dataset("glue", "sst2", split="validation")
|
9 |
+
|
10 |
+
# Markdown template for a single request, filled positionally by
# get_message_single(). The empty first and last entries reproduce the leading
# and trailing newline of the rendered text.
RETURN_MESSAGE_SINGLE = "\n".join(
    [
        "",
        "Inference statistics:",
        "",
        "* Response status: {0}",
        "* Prediction: {1}",
        "* Inference latency (preprocessing/forward/postprocessing): {2} ms",
        "* Peak GPU memory usage: {3} MB",
        "* End-to-end latency (communication + pre/forward/post): {4} ms",
        "* Padding ratio: 0.0 %",
        "",
    ]
)

# Markdown template for the batch ("spam") run, filled positionally by
# get_message_spam(). Only the first content line is an f-string, so the
# request count is baked in here while {0}..{4} stay as str.format fields.
RETURN_MESSAGE_SPAM = "\n".join(
    [
        "",
        f"Processing {SPAM_N_REQUESTS} inputs sent asynchronously. Grab a coffee.",
        "",
        "Inference statistics:",
        "",
        "* Promise resolution time: {0} ms",
        "* Mean inference latency (preprocessing/forward/postprocessing): {1} ms",
        "* Mean peak GPU memory: {2} MB",
        "* Mean padding ratio: {3} %",
        "* Mean sequence length: {4} tokens",
        "",
    ]
)
|
32 |
+
|
33 |
+
def get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs):
    """Render the statistics of one request with RETURN_MESSAGE_SINGLE.

    The five named values fill the template's positional fields in order.
    Any extra keyword arguments are accepted and ignored.
    """
    fields = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
    return RETURN_MESSAGE_SINGLE.format(*fields)
|
35 |
+
|
36 |
+
def get_message_spam(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length, **kwargs):
    """Render the aggregated batch statistics with RETURN_MESSAGE_SPAM.

    The five named values fill the template's positional fields in order.
    Any extra keyword arguments are accepted and ignored.
    """
    fields = (
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
    return RETURN_MESSAGE_SPAM.format(*fields)
|
38 |
+
|
39 |
+
# Module-wide session shared by send_single() and send_spam(); both read the
# `elapsed` value that ElapsedFuturesSession attaches to each response.
SESSION = ElapsedFuturesSession()
|
40 |
+
|
41 |
+
def send_single(input_model_vanilla, address: str):
    """Send one text to an inference endpoint and format the resulting statistics.

    Args:
        input_model_vanilla: Text to classify; posted UTF-8 encoded as the
            request body.
        address: Endpoint URL. Must be ADDRESS_VANILLA or
            ADDRESS_BETTERTRANSFORMER.

    Returns:
        The Markdown message produced by get_message_single().

    Raises:
        ValueError: If ``address`` is not one of the two known endpoints.
    """
    # The address arrives from a UI input, so it is checked with a real
    # exception: an `assert` would vanish under `python -O` and let the app
    # post to an arbitrary URL.
    if address not in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER):
        raise ValueError(f"Unknown inference endpoint: {address!r}")

    promise = SESSION.post(address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"))

    response = promise.result()  # block until this single request completes

    status = response.status_code

    # The body is parsed as JSON and read positionally:
    # [0] prediction, [1] inference latency, [2] peak GPU memory.
    response_text = json.loads(response.text)
    prediction = response_text[0]
    inf_latency = response_text[1]
    peak_gpu_memory = response_text[2]
    end_to_end_latency = response.elapsed

    return get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
|
57 |
+
|
58 |
+
def send_spam(address: str):
    """Send SPAM_N_REQUESTS random SST-2 sentences and format aggregate statistics.

    All requests are submitted before any result is awaited; the responses are
    then resolved in submission order and their statistics accumulated.

    Args:
        address: Endpoint URL. Must be ADDRESS_VANILLA or
            ADDRESS_BETTERTRANSFORMER.

    Returns:
        The Markdown message produced by get_message_spam().

    Raises:
        ValueError: If ``address`` is not a known endpoint, or if
            SPAM_N_REQUESTS is not between 1 and the dataset size.
    """
    # Explicit exceptions instead of `assert`, which is stripped under
    # `python -O`; the address comes from a UI input.
    if address not in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER):
        raise ValueError(f"Unknown inference endpoint: {address!r}")
    if not 0 < SPAM_N_REQUESTS <= len(data):
        raise ValueError(
            f"SPAM_N_REQUESTS must be between 1 and {len(data)}, got {SPAM_N_REQUESTS}"
        )

    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    # Submit every request up front so they are in flight concurrently.
    promises = [
        SESSION.post(address, headers=HEADERS, data=row["sentence"].encode("utf-8"))
        for row in inp
    ]

    resolution_time = 0
    total_inference_latency = 0
    total_peak_gpu_memory = 0
    n_pads = 0
    n_elems = 0
    sequence_length = 0

    for promise in promises:
        response = promise.result()

        # The body is parsed as JSON and read positionally: [1] inference
        # latency, [2] peak GPU memory, [3] padding tokens, [4] total
        # elements, [5] sequence length.
        response_text = json.loads(response.text)

        # The slowest individual request bounds the time to resolve them all.
        resolution_time = max(resolution_time, response.elapsed)

        total_inference_latency += response_text[1]
        total_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]

    # Guard against a zero element count instead of raising ZeroDivisionError.
    padding_percent = n_pads / n_elems * 100 if n_elems else 0.0
    mean_padding_ratio = f"{padding_percent:.2f}"
    mean_sequence_length = sequence_length / SPAM_N_REQUESTS

    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(total_inference_latency / SPAM_N_REQUESTS, 2)
    mean_peak_gpu_memory = round(total_peak_gpu_memory / SPAM_N_REQUESTS, 2)

    return get_message_spam(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length)
|
defaults.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Canned statistics displayed in the UI before any request has been sent.
# The keys match the parameter names of get_message_single() and
# get_message_spam(). The BetterTransformer entries currently hold the same
# numbers as the vanilla ones.
defaults_vanilla_single = dict(
    status=200,
    prediction="Accepted",
    inf_latency=20.77,
    peak_gpu_memory=2717.36,
    end_to_end_latency=93.65,
)

defaults_bt_single = dict(
    status=200,
    prediction="Accepted",
    inf_latency=20.77,
    peak_gpu_memory=2717.36,
    end_to_end_latency=93.65,
)

defaults_vanilla_spam = dict(
    resolution_time=2996.35,
    mean_inference_latency=29.69,
    mean_peak_gpu_memory=3620.9,
    mean_padding_ratio=35.26,
    mean_sequence_length=39.395,
)

defaults_bt_spam = dict(
    resolution_time=2996.35,
    mean_inference_latency=29.69,
    mean_peak_gpu_memory=3620.9,
    mean_padding_ratio=35.26,
    mean_sequence_length=39.395,
)

# Number of requests sent by one "spam" run.
SPAM_N_REQUESTS = 200
BATCH_SIZE = 8  # fixed!

# Request headers and the two inference endpoints the demo may contact.
HEADERS = dict([("Content-Type", "text/plain")])
ADDRESS_VANILLA = "http://3.83.142.46:8080/predictions/my_tc"
ADDRESS_BETTERTRANSFORMER = "http://3.95.36.2:8080/predictions/my_tc"
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
requests_futures
|
utils.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from requests_futures.sessions import FuturesSession
|
2 |
+
|
3 |
+
import time
|
4 |
+
|
5 |
+
class ElapsedFuturesSession(FuturesSession):
    """FuturesSession that records how long each request took, in milliseconds.

    A response hook stores the duration on the response as ``elapsed`` (a float
    rounded to two decimals), measured from the moment ``request()`` is called.
    """

    def request(self, method, url, hooks=None, *args, **kwargs):
        """Issue a request with a timing hook placed first among response hooks.

        Args:
            method: HTTP method, forwarded to the parent session.
            url: Target URL, forwarded to the parent session.
            hooks: Optional hooks mapping. Its ``"response"`` entry may be a
                single callable, a list, or a tuple. The mapping and its
                entries are copied, never modified.
            *args, **kwargs: Forwarded unchanged to ``FuturesSession.request``.

        Returns:
            Whatever ``FuturesSession.request`` returns.
        """
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments.
        start = time.perf_counter()

        def timing(r, *args, **kwargs):
            r.elapsed = round((time.perf_counter() - start) * 1000, 2)

        # Work on a copy so the caller's dict (and any list inside it) is left
        # untouched and can be reused for other requests.
        hooks = dict(hooks) if hooks else {}
        existing = hooks.get('response')

        # The timing hook goes first so other hooks' run time is not measured.
        if existing is None:
            hooks['response'] = [timing]
        elif isinstance(existing, (list, tuple)):
            # A new list also handles tuples, which have no .insert().
            hooks['response'] = [timing, *existing]
        else:
            hooks['response'] = [timing, existing]

        return super().request(method, url, hooks=hooks, *args, **kwargs)
|