Felix Marty committed on
Commit
35e3254
·
1 Parent(s): 586c827
Files changed (6) hide show
  1. Dockerfile +30 -0
  2. app.py +81 -4
  3. backend.py +101 -0
  4. defaults.py +38 -0
  5. requirements.txt +1 -0
  6. utils.py +25 -0
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CUDA base image for the TorchServe + BetterTransformer demo.
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04

# Put the Miniconda binaries (python, pip, conda) first on PATH.
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"

# Install all system packages in one layer so that `apt-get update` and
# `apt-get install` can never be split across a stale build cache, then drop
# the package lists to keep the layer small. ca-certificates is listed
# explicitly because --no-install-recommends would otherwise skip it and
# HTTPS downloads (wget, git clone) would fail.
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends ca-certificates git wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Miniconda non-interactively (-b) and remove the installer.
RUN wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir -p /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh

RUN conda init bash

RUN pip install --no-cache-dir torch torchvision torchaudio

RUN pip install --no-cache-dir torchserve torch-model-archiver torch-workflow-archiver

RUN pip install --no-cache-dir transformers optimum

# Clone from inside /workspace so that the final WORKDIR below points at the
# checkout itself rather than at a freshly created empty directory.
WORKDIR /workspace
RUN git clone https://github.com/fxmarty/bettertransformer_demo.git

WORKDIR /workspace/bettertransformer_demo
app.py CHANGED
@@ -1,7 +1,84 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
5
 
6
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr

# This file calls demo.launch() at module level, i.e. it is meant to be run as
# a top-level script. Relative imports ("from .defaults import ...") need a
# parent package and fail in that situation, so the sibling modules are
# imported absolutely.
from defaults import defaults_vanilla_single, defaults_bt_spam, defaults_bt_single, defaults_vanilla_spam
from defaults import ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER
from backend import send_single, send_spam, get_message_single, get_message_spam

# Introductory Markdown displayed at the top of the page.
INTRO_MARKDOWN = """
Let's try out TorchServe + BetterTransformer! This is some longer description This is some longer description This is some longer description

## Inference using...
"""

with gr.Blocks() as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        # Left column: requests go to the vanilla Transformers endpoint.
        with gr.Column(scale=50):
            gr.Markdown("### Vanilla Transformers + TorchServe")

            # Hidden textbox that only carries the endpoint address to the callbacks.
            address_input_vanilla = gr.Textbox(
                max_lines=1,
                label="ip vanilla",
                value=ADDRESS_VANILLA,
                visible=False,
            )

            input_model_vanilla = gr.Textbox(
                max_lines=1,
                label="Text",
                value="Expectations were low, enjoyment was high",
            )

            # The output panels start with the placeholder statistics from defaults.
            btn_single_vanilla = gr.Button("Send single text request")
            output_single_vanilla = gr.Markdown(
                label="Output single vanilla",
                value=get_message_single(**defaults_vanilla_single),
            )

            btn_spam_vanilla = gr.Button("Spam text requests (from sst2 validation set)")
            output_spam_vanilla = gr.Markdown(
                label="Output spam vanilla",
                value=get_message_spam(**defaults_vanilla_spam),
            )

            btn_single_vanilla.click(
                fn=send_single,
                inputs=[input_model_vanilla, address_input_vanilla],
                outputs=output_single_vanilla,
            )
            btn_spam_vanilla.click(
                fn=send_spam,
                inputs=[address_input_vanilla],
                outputs=output_spam_vanilla,
            )

        # Right column: same controls, wired to the BetterTransformer endpoint.
        with gr.Column(scale=50):
            gr.Markdown("### BetterTransformer + TorchServe")

            address_input_bettertransformer = gr.Textbox(
                max_lines=1,
                label="ip bettertransformer",
                value=ADDRESS_BETTERTRANSFORMER,
                visible=False,
            )

            input_model_bettertransformer = gr.Textbox(
                max_lines=1,
                label="Text",
                value="Expectations were low, enjoyment was high",
            )

            btn_single_bt = gr.Button("Send single text request")
            output_single_bt = gr.Markdown(
                label="Output single bt",
                value=get_message_single(**defaults_bt_single),
            )

            btn_spam_bt = gr.Button("Spam text requests (from sst2 validation set)")
            output_spam_bt = gr.Markdown(
                label="Output spam bt",
                value=get_message_spam(**defaults_bt_spam),
            )

            btn_single_bt.click(
                fn=send_single,
                inputs=[input_model_bettertransformer, address_input_bettertransformer],
                outputs=output_single_bt,
            )

            btn_spam_bt.click(
                fn=send_spam,
                inputs=[address_input_bettertransformer],
                outputs=output_spam_bt,
            )

# Process one event at a time.
demo.queue(concurrency_count=1)
demo.launch()
backend.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json

from datasets import load_dataset

# Absolute imports: this module sits next to defaults.py and utils.py and is
# loaded as a top-level module, where "from .defaults import ..." would raise
# "attempted relative import with no known parent package".
from defaults import SPAM_N_REQUESTS, ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER, HEADERS
from utils import ElapsedFuturesSession

# SST-2 validation split, loaded once at import time; send_spam samples its
# request texts from it.
data = load_dataset("glue", "sst2", split="validation")

# Markdown template for a single request; filled positionally by
# get_message_single.
RETURN_MESSAGE_SINGLE = """
Inference statistics:

* Response status: {0}
* Prediction: {1}
* Inference latency (preprocessing/forward/postprocessing): {2} ms
* Peak GPU memory usage: {3} MB
* End-to-end latency (communication + pre/forward/post): {4} ms
* Padding ratio: 0.0 %
"""

# Markdown template for the spam run. The request count is baked in here by
# concatenation so that the remaining {0}..{4} placeholders stay available for
# str.format in get_message_spam.
RETURN_MESSAGE_SPAM = """
Processing """ + str(SPAM_N_REQUESTS) + """ inputs sent asynchronously. Grab a coffee.

Inference statistics:

* Promise resolution time: {0} ms
* Mean inference latency (preprocessing/forward/postprocessing): {1} ms
* Mean peak GPU memory: {2} MB
* Mean padding ratio: {3} %
* Mean sequence length: {4} tokens
"""
32
+
33
def get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency, **kwargs):
    """Render the single-request statistics message.

    The five named values are substituted, in order, into
    RETURN_MESSAGE_SINGLE. Extra keyword arguments are accepted and ignored.
    """
    fields = (status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
    return RETURN_MESSAGE_SINGLE.format(*fields)
35
+
36
def get_message_spam(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length, **kwargs):
    """Render the spam-run statistics message.

    The five named values are substituted, in order, into
    RETURN_MESSAGE_SPAM. Extra keyword arguments are accepted and ignored.
    """
    fields = (
        resolution_time,
        mean_inference_latency,
        mean_peak_gpu_memory,
        mean_padding_ratio,
        mean_sequence_length,
    )
    return RETURN_MESSAGE_SPAM.format(*fields)
38
+
39
# Session shared by every outgoing request in this module.
SESSION = ElapsedFuturesSession()


def send_single(input_model_vanilla, address: str):
    """Send one text to an inference endpoint and format the statistics.

    Args:
        input_model_vanilla: Text to classify; sent UTF-8 encoded as the
            request body.
        address: Endpoint URL. Must be ADDRESS_VANILLA or
            ADDRESS_BETTERTRANSFORMER.

    Returns:
        The message produced by get_message_single.

    Raises:
        ValueError: If ``address`` is not one of the two known endpoints.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would then let an arbitrary address through.
    if address not in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER):
        raise ValueError(f"Unknown inference address: {address!r}")

    promise = SESSION.post(address, headers=HEADERS, data=input_model_vanilla.encode("utf-8"))

    response = promise.result()  # resolve immediately

    status = response.status_code

    # The body is JSON-decoded and read by position:
    # [0] prediction, [1] inference latency, [2] peak GPU memory.
    response_text = json.loads(response.text)
    prediction = response_text[0]
    inf_latency = response_text[1]
    peak_gpu_memory = response_text[2]
    end_to_end_latency = response.elapsed

    return get_message_single(status, prediction, inf_latency, peak_gpu_memory, end_to_end_latency)
57
+
58
def send_spam(address: str):
    """Fire SPAM_N_REQUESTS asynchronous requests and format aggregate statistics.

    The request texts are the "sentence" field of SPAM_N_REQUESTS examples
    taken from a shuffled copy of the module-level ``data`` dataset.

    Args:
        address: Endpoint URL. Must be ADDRESS_VANILLA or
            ADDRESS_BETTERTRANSFORMER.

    Returns:
        The message produced by get_message_spam.

    Raises:
        ValueError: If ``address`` is not a known endpoint, or if
            SPAM_N_REQUESTS is not in the range 1..len(data).
    """
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if address not in (ADDRESS_VANILLA, ADDRESS_BETTERTRANSFORMER):
        raise ValueError(f"Unknown inference address: {address!r}")

    # TODO: use dynamic data with padding

    if not 0 < SPAM_N_REQUESTS <= len(data):
        raise ValueError(
            f"SPAM_N_REQUESTS must be between 1 and {len(data)}, got {SPAM_N_REQUESTS}"
        )

    inp = data.shuffle().select(range(SPAM_N_REQUESTS))

    # Submit every request before resolving any of them.
    promises = [
        SESSION.post(address, headers=HEADERS, data=example["sentence"].encode("utf-8"))
        for example in inp
    ]

    resolution_time = 0
    mean_inference_latency = 0
    mean_peak_gpu_memory = 0

    n_pads = 0
    n_elems = 0
    sequence_length = 0

    for promise in promises:
        response = promise.result()

        # The body is JSON-decoded and read by position:
        # [1] inference latency, [2] peak GPU memory, [3] pad count,
        # [4] element count, [5] sequence length.
        response_text = json.loads(response.text)

        # Reported resolution time is the slowest individual request.
        resolution_time = max(resolution_time, response.elapsed)

        mean_inference_latency += response_text[1]
        mean_peak_gpu_memory += response_text[2]
        n_pads += response_text[3]
        n_elems += response_text[4]
        sequence_length += response_text[5]

    # Guard against a zero element count instead of raising ZeroDivisionError.
    padding_percent = n_pads / n_elems * 100 if n_elems else 0.0
    mean_padding_ratio = f"{padding_percent:.2f}"
    mean_sequence_length = sequence_length / SPAM_N_REQUESTS

    resolution_time = round(resolution_time, 2)
    mean_inference_latency = round(mean_inference_latency / SPAM_N_REQUESTS, 2)
    mean_peak_gpu_memory = round(mean_peak_gpu_memory / SPAM_N_REQUESTS, 2)

    return get_message_spam(resolution_time, mean_inference_latency, mean_peak_gpu_memory, mean_padding_ratio, mean_sequence_length)
defaults.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Placeholder statistics for the single-request panels (vanilla endpoint).
defaults_vanilla_single = dict(
    status=200,
    prediction="Accepted",
    inf_latency=20.77,
    peak_gpu_memory=2717.36,
    end_to_end_latency=93.65,
)

# Placeholder statistics for the single-request panels (BetterTransformer endpoint).
defaults_bt_single = dict(
    status=200,
    prediction="Accepted",
    inf_latency=20.77,
    peak_gpu_memory=2717.36,
    end_to_end_latency=93.65,
)

# Placeholder statistics for the spam panels (vanilla endpoint).
defaults_vanilla_spam = dict(
    resolution_time=2996.35,
    mean_inference_latency=29.69,
    mean_peak_gpu_memory=3620.9,
    mean_padding_ratio=35.26,
    mean_sequence_length=39.395,
)

# Placeholder statistics for the spam panels (BetterTransformer endpoint).
defaults_bt_spam = dict(
    resolution_time=2996.35,
    mean_inference_latency=29.69,
    mean_peak_gpu_memory=3620.9,
    mean_padding_ratio=35.26,
    mean_sequence_length=39.395,
)

# Number of requests sent in one spam run.
SPAM_N_REQUESTS = 200
BATCH_SIZE = 8  # fixed!

# HTTP settings for the two inference endpoints.
HEADERS = {"Content-Type": "text/plain"}
ADDRESS_VANILLA = "http://3.83.142.46:8080/predictions/my_tc"
ADDRESS_BETTERTRANSFORMER = "http://3.95.36.2:8080/predictions/my_tc"
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ requests_futures
utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from requests_futures.sessions import FuturesSession
2
+
3
+ import time
4
+
5
class ElapsedFuturesSession(FuturesSession):
    """FuturesSession that records how long each request took.

    Every response gets its ``elapsed`` attribute overwritten with the time
    between the call to ``request`` and the response hook firing, expressed
    in milliseconds and rounded to two decimals.
    """

    def request(self, method, url, hooks=None, *args, **kwargs):
        """Issue a request with a timing hook placed before any caller hooks.

        Args:
            method: HTTP method, forwarded unchanged.
            url: Target URL, forwarded unchanged.
            hooks: Optional mapping of hook names to a callable or a
                list/tuple of callables. It is copied, never mutated.
            *args, **kwargs: Forwarded unchanged to the parent ``request``.

        Returns:
            Whatever the parent ``request`` returns.
        """
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments.
        start = time.perf_counter()

        def timing(r, *args, **kwargs):
            r.elapsed = round((time.perf_counter() - start) * 1000, 2)

        # Work on a copy so the caller's dict and hook list stay untouched.
        hooks = dict(hooks) if hooks else {}
        existing = hooks.get('response')

        # The timing hook needs to be first so other hooks' execution time is
        # not included in the measurement.
        if existing is None:
            hooks['response'] = [timing]
        elif isinstance(existing, (list, tuple)):
            # Build a new list: tuples have no insert().
            hooks['response'] = [timing, *existing]
        else:
            hooks['response'] = [timing, existing]

        return super().request(method, url, hooks=hooks, *args, **kwargs)