philschmid HF staff committed on
Commit
5afebd5
·
1 Parent(s): a5cf62b

make requests async and in parallel

Browse files
Files changed (1) hide show
  1. app.py +39 -15
app.py CHANGED
@@ -1,19 +1,23 @@
1
  import os
2
-
3
- import gradio as gr
4
  import requests
 
5
 
6
- TOKEN = os.environ.get("API_TOKEN")
7
-
8
- UL2_API_URL = "https://api-inference.huggingface.co/models/google/flan-ul2"
9
- FLAN_API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
10
 
11
- headers = {"Authorization": f"Bearer {TOKEN}"}
12
- MAX_NEW_TOKENS = 256
 
 
 
 
13
 
14
- def query(text, api_url):
15
- response = requests.post(api_url, headers=headers, json={"inputs":text, "parameters": {"max_new_tokens":MAX_NEW_TOKENS}})
16
- return response.json()
 
 
 
17
 
18
 
19
  examples = [
@@ -35,10 +39,30 @@ Q: A juggler can juggle 16 balls. Half of the balls are golf balls, and half of
35
  title = "Flan UL2 vs Flan T5 XXL"
36
  description = "This demo compares [Flan-T5-xxl](https://huggingface.co/google/flan-t5-xxl) and [Flan-UL2](https://huggingface.co/google/flan-ul2). Learn more about these models in their model card!"
37
 
38
- def inference(text):
39
- output_ul2 = query(text, api_url=UL2_API_URL)[0]["generated_text"]
40
- output_flan = query(text, api_url=FLAN_API_URL)[0]["generated_text"]
41
- return [output_ul2, output_flan]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  io = gr.Interface(
44
  inference,
 
1
  import os
2
+ import asyncio
3
+ from concurrent.futures import ThreadPoolExecutor
4
  import requests
5
+ import gradio as gr
6
 
 
 
 
 
7
 
8
+ MAX_NEW_TOKENS = 128
9
+ TOKEN = os.environ.get("API_TOKEN",None)
10
+ URLS = [
11
+ "https://api-inference.huggingface.co/models/google/flan-ul2",
12
+ "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
13
+ ]
14
 
15
def fetch(session, text, api_url):
    """POST ``text`` to one inference endpoint and return its decoded reply.

    Args:
        session: Object exposing ``post(url, json=...)`` (the caller passes a
            ``requests.Session``).
        text: Prompt sent as the ``"inputs"`` field of the JSON payload.
        api_url: Endpoint URL; its last path segment is used as the model name.

    Returns:
        A ``(model, payload)`` tuple. ``payload`` is the decoded JSON body when
        the endpoint answers with HTTP 200, otherwise ``None``.
    """
    model = api_url.split("/")[-1]
    payload = {"inputs": text, "parameters": {"max_new_tokens": MAX_NEW_TOKENS}}
    response = session.post(api_url, json=payload)
    if response.status_code != 200:
        # Always return a 2-tuple: the caller unpacks every result as
        # (model, response), so a bare None would raise TypeError there.
        return model, None
    return model, response.json()
21
 
22
 
23
  examples = [
 
39
  title = "Flan UL2 vs Flan T5 XXL"
40
  description = "This demo compares [Flan-T5-xxl](https://huggingface.co/google/flan-t5-xxl) and [Flan-UL2](https://huggingface.co/google/flan-ul2). Learn more about these models in their model card!"
41
 
42
async def inference(text):
    """Query every endpoint in ``URLS`` concurrently and collect the replies.

    Each request is dispatched through ``fetch`` on a worker thread so the
    blocking HTTP calls overlap instead of running one after the other.

    Args:
        text: Prompt forwarded to each model endpoint.

    Returns:
        A two-element list ``[flan_ul2_reply, flan_t5_xxl_reply]`` holding the
        decoded JSON returned by ``fetch`` for each model. A slot stays
        ``None`` when that model produced no usable reply.
    """
    # Output position of each model, keyed by the last URL path segment.
    slot_by_model = {"flan-ul2": 0, "flan-t5-xxl": 1}
    responses = [None, None]

    with ThreadPoolExecutor(max_workers=2) as executor:
        with requests.Session() as session:
            if TOKEN:
                # update() keeps the session's default headers; the header is
                # skipped entirely when no token is configured rather than
                # sending the literal string "Bearer None".
                session.headers.update({"Authorization": f"Bearer {TOKEN}"})

            # We are inside a coroutine, so a loop is guaranteed to be running.
            loop = asyncio.get_running_loop()
            tasks = [
                loop.run_in_executor(executor, fetch, session, text, url)
                for url in URLS
            ]

            # Wait for all requests, then file each reply under its model slot.
            for result in await asyncio.gather(*tasks):
                if result is None:
                    # Tolerate a fetch that signals failure with a bare None.
                    continue
                model, response = result
                slot = slot_by_model.get(model)
                if response is not None and slot is not None:
                    responses[slot] = response
    return responses
66
 
67
  io = gr.Interface(
68
  inference,