chansung committed on
Commit
928f123
·
1 Parent(s): 5b0b914
Files changed (12) hide show
  1. .gitignore +1 -0
  2. README.md +5 -6
  3. app.py +449 -99
  4. constants/prompts.toml +17 -0
  5. date_iterator.sh +27 -0
  6. gen/gemini.py +142 -0
  7. gen/utils.py +37 -0
  8. outputs.json +0 -0
  9. paper/download.py +102 -0
  10. paper/parser.py +57 -0
  11. requirements.txt +9 -0
  12. utils.py +28 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -1,14 +1,13 @@
1
  ---
2
- title: Paper Q&A
3
- emoji: 🤓📃
4
- colorFrom: red
5
- colorTo: yellow
6
  sdk: gradio
7
- sdk_version: 4.19.2
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: Explore papers with auto generated Q&As!
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Test Paperqa
3
+ emoji: 🔥
4
+ colorFrom: indigo
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 4.20.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,12 +1,34 @@
1
- import gradio as gr
 
2
  import copy
3
  import datasets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  STYLE = """
6
 
7
- .main {
8
- width: 90% !important;
9
- margin: 0 auto; /* Center the container */
 
 
10
  }
11
 
12
  .small-font{
@@ -16,7 +38,7 @@ STYLE = """
16
  .small-font:hover {
17
  font-size: 20px !important;
18
  transition: font-size 0.3s ease-out;
19
- transition-delay: 0.8s;
20
  }
21
 
22
  .group {
@@ -50,22 +72,207 @@ STYLE = """
50
  border-radius: 0px;
51
  }
52
 
53
- #search_input > label > span {
 
 
 
 
54
  display: none;
55
  }
56
 
57
- #exp-type > span {
58
  display: none;
59
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  """
61
 
 
 
 
62
  dataset_repo_id = "chansung/auto-paper-qa2"
 
 
63
  ds = datasets.load_dataset(dataset_repo_id)
 
 
 
 
 
 
64
 
65
  title2qna = {}
66
  date2qna = {}
67
  longest_qans = 0
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  def count_nans(row):
70
  count = 0
71
 
@@ -119,33 +326,33 @@ def set_paper(date, paper_title):
119
  return (
120
  gr.Markdown(f"# {selected_paper['title']}"), gr.Markdown(selected_paper["summary"]),
121
 
122
- gr.Markdown(f"## πŸ™‹ {selected_paper['0_question']}"),
123
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_answers:eli5']}"),
124
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_answers:expert']}"),
125
- gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['0_additional_depth_q:follow up question']}"),
126
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}"),
127
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}"),
128
- gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['0_additional_breath_q:follow up question']}"),
129
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}"),
130
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}"),
131
 
132
- gr.Markdown(f"## πŸ™‹ {selected_paper['1_question']}"),
133
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_answers:eli5']}"),
134
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_answers:expert']}"),
135
- gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['1_additional_depth_q:follow up question']}"),
136
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}"),
137
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}"),
138
- gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['1_additional_breath_q:follow up question']}"),
139
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}"),
140
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}"),
141
 
142
- gr.Markdown(f"## πŸ™‹ {selected_paper['2_question']}"),
143
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_answers:eli5']}"),
144
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_answers:expert']}"),
145
- gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['2_additional_depth_q:follow up question']}"),
146
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}"),
147
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}"),
148
- gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['2_additional_breath_q:follow up question']}"),
149
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}"),
150
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}"),
151
  )
@@ -196,7 +403,7 @@ function search(searchIn, maxResults = 3) {{
196
  let titles = {list(titles)};
197
 
198
  for (const title of titles) {{ // Assuming 'titles' is an array defined elsewhere
199
- if (results.length > 3) {{
200
  break;
201
  }} else {{
202
  if (title.toLowerCase().includes(searchIn.toLowerCase())) {{ // JavaScript's equivalent to Python's 'in'
@@ -206,7 +413,7 @@ function search(searchIn, maxResults = 3) {{
206
  }}
207
 
208
  // Handle UI elements (Explanation below)
209
- const resultElements = [1, 2, 3].map(index => {{
210
  return results[index - 1] || '';
211
  }});
212
 
@@ -228,13 +435,74 @@ function search(searchIn, maxResults = 3) {{
228
  document.getElementById('search_r3').style.display = 'block';
229
  }}
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  return resultElements;
232
  }} else {{
233
  document.getElementById('search_r1').style.display = 'none';
234
  document.getElementById('search_r2').style.display = 'none';
235
  document.getElementById('search_r3').style.display = 'none';
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- return ['', '', '']
 
 
 
 
 
 
 
238
  }}
239
  }}
240
  """
@@ -251,7 +519,7 @@ def set_papers(date, title):
251
  gr.Textbox("")
252
  )
253
 
254
- with gr.Blocks(css=STYLE) as demo:
255
  gr.Markdown("# Let's explore papers with auto generated Q&As")
256
 
257
  with gr.Column(elem_classes=["group"]):
@@ -272,108 +540,164 @@ with gr.Blocks(css=STYLE) as demo:
272
  )
273
 
274
  with gr.Column(elem_classes=["no-gap"]):
275
- search_in = gr.Textbox("", placeholder="Enter keywords to search...", elem_id="search_input")
276
  search_r1 = gr.Button(visible=False, elem_id="search_r1", elem_classes=["no-radius"])
277
  search_r2 = gr.Button(visible=False, elem_id="search_r2", elem_classes=["no-radius"])
278
  search_r3 = gr.Button(visible=False, elem_id="search_r3", elem_classes=["no-radius"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
- title = gr.Markdown(f"# {selected_paper['title']}")
281
- summary = gr.Markdown(f"{selected_paper['summary']}", elem_classes=["small-font"])
282
-
283
- with gr.Row():
284
- with gr.Column(scale=7):
285
- gr.Markdown("## Auto generated Questions & Answers")
286
-
287
- exp_type = gr.Radio(choices=["ELI5", "Technical"], value="ELI5", elem_id="exp-type", scale=3)
288
-
289
- # 1
290
- with gr.Column(elem_classes=["group"], visible=True) as q_0:
291
- basic_q_0 = gr.Markdown(f"## πŸ™‹ {selected_paper['0_question']}")
292
- basic_q_eli5_0 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_answers:eli5']}", elem_classes=["small-font"])
293
- basic_q_expert_0 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_answers:expert']}", visible=False, elem_classes=["small-font"])
294
-
295
- with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_0_0:
296
- depth_q_0 = gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['0_additional_depth_q:follow up question']}")
297
- depth_q_eli5_0 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
298
- depth_q_expert_0 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
299
-
300
- with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_0_1:
301
- breath_q_0 = gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['0_additional_breath_q:follow up question']}")
302
- breath_q_eli5_0 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
303
- breath_q_expert_0 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
304
-
305
- # 2
306
- with gr.Column(elem_classes=["group"], visible=True) as q_1:
307
- basic_q_1 = gr.Markdown(f"## πŸ™‹ {selected_paper['1_question']}")
308
- basic_q_eli5_1 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_answers:eli5']}", elem_classes=["small-font"])
309
- basic_q_expert_1 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_answers:expert']}", visible=False, elem_classes=["small-font"])
310
-
311
- with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_1_0:
312
- depth_q_1 = gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['1_additional_depth_q:follow up question']}")
313
- depth_q_eli5_1 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
314
- depth_q_expert_1 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
315
-
316
- with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_1_1:
317
- breath_q_1 = gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['1_additional_breath_q:follow up question']}")
318
- breath_q_eli5_1 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
319
- breath_q_expert_1 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
320
-
321
- # 3
322
- with gr.Column(elem_classes=["group"], visible=True) as q_2:
323
- basic_q_2 = gr.Markdown(f"## πŸ™‹ {selected_paper['2_question']}")
324
- basic_q_eli5_2 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_answers:eli5']}", elem_classes=["small-font"])
325
- basic_q_expert_2 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_answers:expert']}", visible=False, elem_classes=["small-font"])
326
-
327
- with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_2_0:
328
- depth_q_2 = gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['2_additional_depth_q:follow up question']}")
329
- depth_q_eli5_2 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
330
- depth_q_expert_2 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
331
-
332
- with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_2_1:
333
- breath_q_2 = gr.Markdown(f"## πŸ™‹πŸ™‹ {selected_paper['2_additional_breath_q:follow up question']}")
334
- breath_q_eli5_2 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
335
- breath_q_expert_2 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
336
 
337
  gr.Markdown("The target papers are collected from [Hugging Face πŸ€— Daily Papers](https://huggingface.co/papers) on a daily basis. "
338
  "The entire data is generated by [Google's Gemini 1.0](https://deepmind.google/technologies/gemini/) Pro. "
339
  "If you are curious how it is done, visit the [Auto Paper Q&A Generation project repository](https://github.com/deep-diver/auto-paper-analysis) "
340
  "Also, the generated dataset is hosted on Hugging Face πŸ€— Dataset repository as well([Link](https://huggingface.co/datasets/chansung/auto-paper-qa2)). ")
341
 
342
- search_r1.click(
343
- set_date,
344
- search_r1,
345
- date_dd
346
- ).then(
347
  set_papers,
348
  inputs=[date_dd, search_r1],
349
  outputs=[papers_dd, search_in]
350
  )
351
 
352
- search_r2.click(
353
- set_date,
354
- search_r2,
355
- date_dd
356
- ).then(
357
  set_papers,
358
  inputs=[date_dd, search_r2],
359
  outputs=[papers_dd, search_in]
360
  )
361
 
362
- search_r3.click(
363
- set_date,
364
- search_r3,
365
- date_dd
366
- ).then(
367
  set_papers,
368
  inputs=[date_dd, search_r3],
369
  outputs=[papers_dd, search_in]
370
  )
371
 
372
- date_dd.input(
373
- get_papers,
374
- date_dd,
375
- papers_dd
376
- ).then(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  set_paper,
378
  [date_dd, papers_dd],
379
  [
@@ -413,7 +737,10 @@ with gr.Blocks(css=STYLE) as demo:
413
 
414
  search_in.change(
415
  inputs=[search_in],
416
- outputs=[search_r1, search_r2, search_r3],
 
 
 
417
  js=UPDATE_SEARCH_RESULTS,
418
  fn=None
419
  )
@@ -428,4 +755,27 @@ with gr.Blocks(css=STYLE) as demo:
428
  ]
429
  )
430
 
431
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
  import copy
4
  import datasets
5
+ import pandas as pd
6
+ import gradio as gr
7
+
8
+ from datetime import datetime, timedelta
9
+ from datasets import Dataset
10
+ from huggingface_hub import HfApi
11
+ from huggingface_hub import create_repo
12
+ from huggingface_hub.utils import HfHubHTTPError
13
+
14
+ from paper.download import (
15
+ download_pdf_from_arxiv,
16
+ get_papers_from_hf_daily_papers,
17
+ get_papers_from_arxiv_ids
18
+ )
19
+ from paper.parser import extract_text_and_figures
20
+ from gen.gemini import get_basic_qa, get_deep_qa
21
+ import utils
22
+
23
+ from apscheduler.schedulers.background import BackgroundScheduler
24
 
25
  STYLE = """
26
 
27
+ @media only screen and (max-width: 700px) {
28
+ .main {
29
+ width: 80% !important;
30
+ margin: 0 auto; /* Center the container */
31
+ }
32
  }
33
 
34
  .small-font{
 
38
  .small-font:hover {
39
  font-size: 20px !important;
40
  transition: font-size 0.3s ease-out;
41
+ transition-delay: 1.5s;
42
  }
43
 
44
  .group {
 
72
  border-radius: 0px;
73
  }
74
 
75
+ .textbox-no-label > label > span {
76
+ display: none;
77
+ }
78
+
79
+ .exp-type > span {
80
  display: none;
81
  }
82
 
83
+ .conv-type > span {
84
  display: none;
85
  }
86
+
87
+ .conv-type .wrap:nth-child(3) {
88
+ width: 167px;
89
+ margin: auto;
90
+ }
91
+
92
+ button {
93
+ font-size: 10pt !important;
94
+ }
95
+
96
+ h3 {
97
+ font-size: 13pt !important;
98
+ }
99
  """
100
 
101
# Credentials come from the environment; os.getenv returns None when unset.
gemini_api_key = os.getenv("GEMINI_API_KEY")
hf_token = os.getenv("HF_TOKEN")

dataset_repo_id = "chansung/auto-paper-qa2"
request_arxiv_repo_id = "chansung/requested-arxiv-ids-3"

# Both datasets are loaded once, at import time.
ds = datasets.load_dataset(dataset_repo_id)
request_ds = datasets.load_dataset(request_arxiv_repo_id)

# Every request row carries a list of IDs; flatten them into a single list.
# extend() appends in place instead of rebuilding the list on every row.
requested_arxiv_ids = []
for request_d in request_ds['train']:
    arxiv_ids = request_d['Requested arXiv IDs']
    requested_arxiv_ids.extend(arxiv_ids)
requested_arxiv_ids_df = pd.DataFrame({'Requested arXiv IDs': requested_arxiv_ids})
114
 
115
  title2qna = {}
116
  date2qna = {}
117
  longest_qans = 0
118
 
119
def filter_function(example, ids):
    """Remove the given arXiv IDs from one request row.

    Args:
        example: Row dict whose 'Requested arXiv IDs' value is a list of IDs.
        ids: Iterable of hashable arXiv IDs to drop from that list.

    Returns:
        The same ``example`` dict, with 'Requested arXiv IDs' rebound to a
        new list that contains none of ``ids``. Every occurrence is removed,
        and the list object originally stored in the row is left untouched.
    """
    handled = set(ids)
    example['Requested arXiv IDs'] = [
        arxiv_id
        for arxiv_id in example['Requested arXiv IDs']
        if arxiv_id not in handled
    ]
    return example
128
+
129
def process_arxiv_ids(gemini_api, hf_repo_id, req_hf_repo_id, hf_token, how_many=10):
    """Generate Q&As for queued arXiv IDs and publish them.

    Loads the request dataset, collects up to ``how_many`` requested IDs,
    and for each paper: downloads the PDF, extracts its text, generates
    basic and follow-up Q&As, pushes the result to ``hf_repo_id``, and
    removes the ID from the request dataset. A failure on one paper is
    printed and the loop moves on to the next paper.

    Args:
        gemini_api: API key forwarded to ``get_basic_qa`` / ``get_deep_qa``.
        hf_repo_id: Dataset repo that receives the generated Q&As.
        req_hf_repo_id: Dataset repo holding the requested arXiv IDs.
        hf_token: Hugging Face token used for pushes and the Space restart.
        how_many: Maximum number of IDs handled per call.
    """
    # Keep the working copy of the request split in its own variable.
    # Rebinding the loaded dataset dict to the filtered split (as before)
    # made the second loop iteration fail on ['train'], so only the first
    # processed ID was ever removed from the queue.
    pending_requests = datasets.load_dataset(req_hf_repo_id)['train']

    arxiv_ids = []
    for d in pending_requests:
        req_arxiv_ids = d['Requested arXiv IDs']
        # Rows whose first entry is the literal "top" are not ID requests.
        if len(req_arxiv_ids) > 0 and req_arxiv_ids[0] != "top":
            arxiv_ids.extend(req_arxiv_ids)

    arxiv_ids = arxiv_ids[:how_many]

    if arxiv_ids:
        print(f"1. Get metadata for the papers [{arxiv_ids}]")
        papers = get_papers_from_arxiv_ids(arxiv_ids)
        print("...DONE")

        print("2. Generating QAs for the paper")
        for paper in papers:
            try:
                title = paper['title']
                target_date = paper['target_date']
                abstract = paper['paper']['summary']
                arxiv_id = paper['paper']['id']
                authors = paper['paper']['authors']

                print(f"...PROCESSING ON[{arxiv_id}, {title}]")
                print("......Downloading the paper PDF")
                filename = download_pdf_from_arxiv(arxiv_id)
                print("......DONE")

                print("......Extracting text and figures")
                texts, _figures = extract_text_and_figures(filename)
                text = ' '.join(texts)
                print("......DONE")

                print("......Generating the seed(basic) QAs")
                # `trucate` is the keyword spelling this call has always
                # used; presumably it matches gen.gemini -- verify before
                # renaming.
                qnas = get_basic_qa(text, gemini_api_key=gemini_api, trucate=30000)
                qnas['title'] = title
                qnas['abstract'] = abstract
                qnas['authors'] = ','.join(authors)
                qnas['arxiv_id'] = arxiv_id
                qnas['target_date'] = target_date
                qnas['full_text'] = text
                print("......DONE")

                print("......Generating the follow-up QAs")
                qnas = get_deep_qa(text, qnas, gemini_api_key=gemini_api, trucate=30000)
                del qnas["qna"]
                print("......DONE")

                print(f"......Exporting to HF Dataset repo at [{hf_repo_id}]")
                utils.push_to_hf_hub(qnas, hf_repo_id, hf_token)
                print("......DONE")

                print(f"......Updating request arXiv HF Dataset repo at [{req_hf_repo_id}]")
                # Drop the processed ID, then drop rows left with no IDs.
                pending_requests = pending_requests.map(
                    lambda example: filter_function(example, [arxiv_id])
                ).filter(
                    lambda example: len(example['Requested arXiv IDs']) > 0
                )
                pending_requests.push_to_hub(req_hf_repo_id, token=hf_token)

                print("......DONE")
            except Exception as e:
                # Best effort per paper: report and move on to the next one.
                print(f".......failed due to exception {e}")

        # NOTE(review): assumed to run only when at least one ID was
        # queued -- confirm against the original indentation.
        HfApi(token=hf_token).restart_space(
            repo_id="chansung/paper_qa", token=hf_token
        )
199
+
200
def push_to_hf_hub(
    df, repo_id, token, append=True
):
    """Push a DataFrame to a Hugging Face dataset repo.

    Args:
        df: pandas DataFrame to upload.
        repo_id: Target dataset repo; created if the create call succeeds.
        token: Hugging Face token used for repo creation and the push.
        append: When True and repo creation failed (taken to mean the repo
            already exists), the new rows are concatenated after the repo's
            existing 'train' split before pushing.
    """
    new_rows = Dataset.from_pandas(df)

    try:
        # Use the caller's repo_id/token rather than the module globals,
        # so the function works for any target repo.
        create_repo(repo_id, repo_type="dataset", token=token)
        exist = False
    except HfHubHTTPError:
        # NOTE(review): every hub HTTP error is treated as "repo already
        # exists", as before; other failures surface at load/push below.
        exist = True

    if exist and append:
        existing_ds = datasets.load_dataset(repo_id)
        new_rows = datasets.concatenate_datasets([existing_ds['train'], new_rows])

    new_rows.push_to_hub(repo_id, token=token)
216
+
217
def _filter_duplicate_arxiv_ids(arxiv_ids_to_be_added):
    """Return the candidate IDs that are neither queued nor already processed.

    Args:
        arxiv_ids_to_be_added: Iterable of arXiv ID strings to check.

    Returns:
        A list of the candidates found in neither the request dataset nor
        the Q&A dataset, de-duplicated and in first-seen input order.
    """
    # Same repos as the module-level constants, kept in one place.
    requested = datasets.load_dataset(request_arxiv_repo_id)
    processed = datasets.load_dataset(dataset_repo_id)

    known_ids = set()
    for d in requested['train']:
        known_ids.update(d['Requested arXiv IDs'])
    for d in processed['train']:
        known_ids.add(d['arxiv_id'])

    fresh_ids = []
    for arxiv_id in arxiv_ids_to_be_added:
        if arxiv_id not in known_ids:
            # Record it so a repeated candidate is only returned once.
            known_ids.add(arxiv_id)
            fresh_ids.append(arxiv_id)
    return fresh_ids
232
+
233
+ def _is_arxiv_id_valid(arxiv_id):
234
+ pattern = r"^\d{4}\.\d{5}$"
235
+ return bool(re.match(pattern, arxiv_id))
236
+
237
def _get_valid_arxiv_ids(arxiv_ids_str):
    """Split a comma-separated string into valid and invalid arXiv IDs.

    Args:
        arxiv_ids_str: Comma-separated arXiv IDs as typed by the user.

    Returns:
        A ``(valid_arxiv_ids, invalid_arxiv_ids)`` tuple of lists, each in
        input order. Tokens are stripped of surrounding whitespace, and
        blank tokens (empty input, trailing or doubled commas) are skipped
        rather than reported as invalid.
    """
    valid_arxiv_ids = []
    invalid_arxiv_ids = []

    for token in arxiv_ids_str.split(","):
        arxiv_id = token.strip()
        if not arxiv_id:
            continue
        bucket = valid_arxiv_ids if _is_arxiv_id_valid(arxiv_id) else invalid_arxiv_ids
        bucket.append(arxiv_id)

    return valid_arxiv_ids, invalid_arxiv_ids
249
+
250
def add_arxiv_ids_to_queue(queue, arxiv_ids_str):
    """Validate user-entered arXiv IDs and append the new ones to the queue.

    Args:
        queue: pandas DataFrame currently shown as the request queue.
        arxiv_ids_str: Comma-separated arXiv IDs entered by the user.

    Returns:
        The queue DataFrame: with the new rows appended and a fresh 0..n-1
        index when any ID was accepted, otherwise the input unchanged.
        Accepted IDs are also pushed to the request dataset repo.
    """
    valid_arxiv_ids, invalid_arxiv_ids = _get_valid_arxiv_ids(arxiv_ids_str)

    if len(invalid_arxiv_ids) > 0:
        gr.Warning(f"found invalid arXiv ids as in {invalid_arxiv_ids}")

    if len(valid_arxiv_ids) == 0:
        gr.Warning("No valid arXiv IDs found...")
        return queue

    valid_arxiv_ids = _filter_duplicate_arxiv_ids(valid_arxiv_ids)
    if len(valid_arxiv_ids) == 0:
        gr.Warning("All requested arXiv IDs are already processed or being processed...")
        return queue

    # One row per request; each row stores its ID as a one-element list.
    # NOTE(review): the initial queue frame is built from a flattened list
    # of plain strings, so appended rows differ in shape -- confirm that
    # the displayed queue is meant to mix the two.
    valid_arxiv_ids = [[arxiv_id] for arxiv_id in valid_arxiv_ids]
    gr.Warning(f"Processing on [{valid_arxiv_ids}]. Other requested arXiv IDs not found on this list should be already processed or being processed...")
    valid_arxiv_ids = pd.DataFrame({'Requested arXiv IDs': valid_arxiv_ids})

    # ignore_index renumbers the rows; the previous reset_index(drop=True)
    # call discarded its result, leaving duplicate index labels.
    queue = pd.concat([queue, valid_arxiv_ids], ignore_index=True)

    push_to_hf_hub(valid_arxiv_ids, request_arxiv_repo_id, hf_token)
    return queue
275
+
276
  def count_nans(row):
277
  count = 0
278
 
 
326
  return (
327
  gr.Markdown(f"# {selected_paper['title']}"), gr.Markdown(selected_paper["summary"]),
328
 
329
+ gr.Markdown(f"### πŸ™‹ {selected_paper['0_question']}"),
330
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_answers:eli5']}"),
331
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_answers:expert']}"),
332
+ gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['0_additional_depth_q:follow up question']}"),
333
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}"),
334
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}"),
335
+ gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['0_additional_breath_q:follow up question']}"),
336
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}"),
337
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}"),
338
 
339
+ gr.Markdown(f"### πŸ™‹ {selected_paper['1_question']}"),
340
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_answers:eli5']}"),
341
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_answers:expert']}"),
342
+ gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['1_additional_depth_q:follow up question']}"),
343
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}"),
344
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}"),
345
+ gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['1_additional_breath_q:follow up question']}"),
346
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}"),
347
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}"),
348
 
349
+ gr.Markdown(f"### πŸ™‹ {selected_paper['2_question']}"),
350
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_answers:eli5']}"),
351
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_answers:expert']}"),
352
+ gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['2_additional_depth_q:follow up question']}"),
353
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}"),
354
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}"),
355
+ gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['2_additional_breath_q:follow up question']}"),
356
  gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}"),
357
  gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}"),
358
  )
 
403
  let titles = {list(titles)};
404
 
405
  for (const title of titles) {{ // Assuming 'titles' is an array defined elsewhere
406
+ if (results.length > 10) {{
407
  break;
408
  }} else {{
409
  if (title.toLowerCase().includes(searchIn.toLowerCase())) {{ // JavaScript's equivalent to Python's 'in'
 
413
  }}
414
 
415
  // Handle UI elements (Explanation below)
416
+ const resultElements = [1,2,3,4,5,6,7,8,9,10].map(index => {{
417
  return results[index - 1] || '';
418
  }});
419
 
 
435
  document.getElementById('search_r3').style.display = 'block';
436
  }}
437
 
438
+ if (resultElements[3] == '') {{
439
+ document.getElementById('search_r4').style.display = 'none';
440
+ }} else {{
441
+ document.getElementById('search_r4').style.display = 'block';
442
+ }}
443
+
444
+ if (resultElements[4] == '') {{
445
+ document.getElementById('search_r5').style.display = 'none';
446
+ }} else {{
447
+ document.getElementById('search_r5').style.display = 'block';
448
+ }}
449
+
450
+ if (resultElements[5] == '') {{
451
+ document.getElementById('search_r6').style.display = 'none';
452
+ }} else {{
453
+ document.getElementById('search_r6').style.display = 'block';
454
+ }}
455
+
456
+ if (resultElements[6] == '') {{
457
+ document.getElementById('search_r7').style.display = 'none';
458
+ }} else {{
459
+ document.getElementById('search_r7').style.display = 'block';
460
+ }}
461
+
462
+ if (resultElements[7] == '') {{
463
+ document.getElementById('search_r8').style.display = 'none';
464
+ }} else {{
465
+ document.getElementById('search_r8').style.display = 'block';
466
+ }}
467
+
468
+ if (resultElements[8] == '') {{
469
+ document.getElementById('search_r9').style.display = 'none';
470
+ }} else {{
471
+ document.getElementById('search_r9').style.display = 'block';
472
+ }}
473
+
474
+ if (resultElements[9] == '') {{
475
+ document.getElementById('search_r10').style.display = 'none';
476
+ }} else {{
477
+ document.getElementById('search_r10').style.display = 'block';
478
+ }}
479
+
480
  return resultElements;
481
  }} else {{
482
  document.getElementById('search_r1').style.display = 'none';
483
  document.getElementById('search_r2').style.display = 'none';
484
  document.getElementById('search_r3').style.display = 'none';
485
+ document.getElementById('search_r4').style.display = 'none';
486
+ document.getElementById('search_r5').style.display = 'none';
487
+ document.getElementById('search_r6').style.display = 'none';
488
+ document.getElementById('search_r7').style.display = 'none';
489
+ document.getElementById('search_r8').style.display = 'none';
490
+ document.getElementById('search_r9').style.display = 'none';
491
+ document.getElementById('search_r10').style.display = 'none';
492
+
493
+ return ['', '', '', '', '', '', '', '', '', '']
494
+ }}
495
+ }}
496
+ """
497
 
498
# JavaScript snippet that toggles between the Q&A panel ('qna_block') and
# the chat panel ('chat_block') based on the selected interface type.
# A plain string: nothing is interpolated, so no brace escaping is needed.
UPDATE_IF_TYPE = """
function chage_if_type(if_type) {
  if (if_type == 'Q&As') {
    document.getElementById('chat_block').style.display = 'none';
    document.getElementById('qna_block').style.display = 'block';
  } else {
    document.getElementById('chat_block').style.display = 'block';
    document.getElementById('qna_block').style.display = 'none';
  }
}
"""
 
519
  gr.Textbox("")
520
  )
521
 
522
+ with gr.Blocks(css=STYLE, theme=gr.themes.Soft()) as demo:
523
  gr.Markdown("# Let's explore papers with auto generated Q&As")
524
 
525
  with gr.Column(elem_classes=["group"]):
 
540
  )
541
 
542
  with gr.Column(elem_classes=["no-gap"]):
543
+ search_in = gr.Textbox("", placeholder="Enter keywords to search...", elem_classes=["textbox-no-label"])
544
  search_r1 = gr.Button(visible=False, elem_id="search_r1", elem_classes=["no-radius"])
545
  search_r2 = gr.Button(visible=False, elem_id="search_r2", elem_classes=["no-radius"])
546
  search_r3 = gr.Button(visible=False, elem_id="search_r3", elem_classes=["no-radius"])
547
+ search_r4 = gr.Button(visible=False, elem_id="search_r4", elem_classes=["no-radius"])
548
+ search_r5 = gr.Button(visible=False, elem_id="search_r5", elem_classes=["no-radius"])
549
+ search_r6 = gr.Button(visible=False, elem_id="search_r6", elem_classes=["no-radius"])
550
+ search_r7 = gr.Button(visible=False, elem_id="search_r7", elem_classes=["no-radius"])
551
+ search_r8 = gr.Button(visible=False, elem_id="search_r8", elem_classes=["no-radius"])
552
+ search_r9 = gr.Button(visible=False, elem_id="search_r9", elem_classes=["no-radius"])
553
+ search_r10 = gr.Button(visible=False, elem_id="search_r10", elem_classes=["no-radius"])
554
+
555
+ conv_type = gr.Radio(choices=["Q&As", "Chat"], value="Q&As", interactive=True, visible=False, elem_classes=["conv-type"])
556
+
557
+ with gr.Column(scale=7):
558
+ title = gr.Markdown(f"# {selected_paper['title']}")
559
+ summary = gr.Markdown(f"{selected_paper['summary']}", elem_classes=["small-font"])
560
+
561
+ with gr.Column(elem_id="chat_block", visible=False):
562
+ gr.Chatbot([("hello", "world"), ("how", "are you?")])
563
+
564
+ with gr.Column(elem_id="qna_block", visible=True):
565
+ with gr.Row():
566
+ with gr.Column(scale=7):
567
+ gr.Markdown("## Auto generated Questions & Answers")
568
+
569
+ exp_type = gr.Radio(choices=["ELI5", "Technical"], value="ELI5", elem_classes=["exp-type"], scale=3)
570
+
571
+ # 1
572
+ with gr.Column(elem_classes=["group"], visible=True) as q_0:
573
+ basic_q_0 = gr.Markdown(f"### πŸ™‹ {selected_paper['0_question']}")
574
+ basic_q_eli5_0 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_answers:eli5']}", elem_classes=["small-font"])
575
+ basic_q_expert_0 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_answers:expert']}", visible=False, elem_classes=["small-font"])
576
+
577
+ with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_0_0:
578
+ depth_q_0 = gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['0_additional_depth_q:follow up question']}")
579
+ depth_q_eli5_0 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
580
+ depth_q_expert_0 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
581
+
582
+ with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_0_1:
583
+ breath_q_0 = gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['0_additional_breath_q:follow up question']}")
584
+ breath_q_eli5_0 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['0_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
585
+ breath_q_expert_0 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['0_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
586
+
587
+ # 2
588
+ with gr.Column(elem_classes=["group"], visible=True) as q_1:
589
+ basic_q_1 = gr.Markdown(f"### πŸ™‹ {selected_paper['1_question']}")
590
+ basic_q_eli5_1 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_answers:eli5']}", elem_classes=["small-font"])
591
+ basic_q_expert_1 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_answers:expert']}", visible=False, elem_classes=["small-font"])
592
+
593
+ with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_1_0:
594
+ depth_q_1 = gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['1_additional_depth_q:follow up question']}")
595
+ depth_q_eli5_1 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
596
+ depth_q_expert_1 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
597
+
598
+ with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_1_1:
599
+ breath_q_1 = gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['1_additional_breath_q:follow up question']}")
600
+ breath_q_eli5_1 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['1_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
601
+ breath_q_expert_1 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['1_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
602
+
603
+ # 3
604
+ with gr.Column(elem_classes=["group"], visible=True) as q_2:
605
+ basic_q_2 = gr.Markdown(f"### πŸ™‹ {selected_paper['2_question']}")
606
+ basic_q_eli5_2 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_answers:eli5']}", elem_classes=["small-font"])
607
+ basic_q_expert_2 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_answers:expert']}", visible=False, elem_classes=["small-font"])
608
+
609
+ with gr.Accordion("Additional question #1", open=False, elem_classes=["accordion"]) as aq_2_0:
610
+ depth_q_2 = gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['2_additional_depth_q:follow up question']}")
611
+ depth_q_eli5_2 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_additional_depth_q:answers:eli5']}", elem_classes=["small-font"])
612
+ depth_q_expert_2 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_additional_depth_q:answers:expert']}", visible=False, elem_classes=["small-font"])
613
+
614
+ with gr.Accordion("Additional question #2", open=False, elem_classes=["accordion"]) as aq_2_1:
615
+ breath_q_2 = gr.Markdown(f"### πŸ™‹πŸ™‹ {selected_paper['2_additional_breath_q:follow up question']}")
616
+ breath_q_eli5_2 = gr.Markdown(f"β†ͺ **(ELI5)** {selected_paper['2_additional_breath_q:answers:eli5']}", elem_classes=["small-font"])
617
+ breath_q_expert_2 = gr.Markdown(f"β†ͺ **(Technical)** {selected_paper['2_additional_breath_q:answers:expert']}", visible=False, elem_classes=["small-font"])
618
+
619
+ gr.Markdown("## Request any arXiv ids")
620
+ arxiv_queue = gr.Dataframe(
621
+ headers=["Requested arXiv IDs"], col_count=(1, "fixed"),
622
+ value=requested_arxiv_ids_df,
623
+ datatype=["str"],
624
+ interactive=False
625
+ )
626
+
627
+ arxiv_id_enter = gr.Textbox(placeholder="Enter comma separated arXiv IDs...", elem_classes=["textbox-no-label"])
628
+ arxiv_id_enter.submit(
629
+ add_arxiv_ids_to_queue,
630
+ [arxiv_queue, arxiv_id_enter],
631
+ arxiv_queue
632
+ )
633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
 
635
  gr.Markdown("The target papers are collected from [Hugging Face πŸ€— Daily Papers](https://huggingface.co/papers) on a daily basis. "
636
  "The entire data is generated by [Google's Gemini 1.0](https://deepmind.google/technologies/gemini/) Pro. "
637
  "If you are curious how it is done, visit the [Auto Paper Q&A Generation project repository](https://github.com/deep-diver/auto-paper-analysis) "
638
  "Also, the generated dataset is hosted on Hugging Face πŸ€— Dataset repository as well([Link](https://huggingface.co/datasets/chansung/auto-paper-qa2)). ")
639
 
640
+ search_r1.click(set_date, search_r1, date_dd).then(
 
 
 
 
641
  set_papers,
642
  inputs=[date_dd, search_r1],
643
  outputs=[papers_dd, search_in]
644
  )
645
 
646
+ search_r2.click(set_date, search_r2, date_dd).then(
 
 
 
 
647
  set_papers,
648
  inputs=[date_dd, search_r2],
649
  outputs=[papers_dd, search_in]
650
  )
651
 
652
+ search_r3.click(set_date, search_r3, date_dd).then(
 
 
 
 
653
  set_papers,
654
  inputs=[date_dd, search_r3],
655
  outputs=[papers_dd, search_in]
656
  )
657
 
658
+ search_r4.click(set_date, search_r4, date_dd).then(
659
+ set_papers,
660
+ inputs=[date_dd, search_r4],
661
+ outputs=[papers_dd, search_in]
662
+ )
663
+
664
+ search_r5.click(set_date, search_r5, date_dd).then(
665
+ set_papers,
666
+ inputs=[date_dd, search_r5],
667
+ outputs=[papers_dd, search_in]
668
+ )
669
+
670
+ search_r6.click(set_date, search_r6, date_dd).then(
671
+ set_papers,
672
+ inputs=[date_dd, search_r6],
673
+ outputs=[papers_dd, search_in]
674
+ )
675
+
676
+ search_r7.click(set_date, search_r7, date_dd).then(
677
+ set_papers,
678
+ inputs=[date_dd, search_r7],
679
+ outputs=[papers_dd, search_in]
680
+ )
681
+
682
+ search_r8.click(set_date, search_r8, date_dd).then(
683
+ set_papers,
684
+ inputs=[date_dd, search_r8],
685
+ outputs=[papers_dd, search_in]
686
+ )
687
+
688
+ search_r9.click(set_date, search_r9, date_dd).then(
689
+ set_papers,
690
+ inputs=[date_dd, search_r9],
691
+ outputs=[papers_dd, search_in]
692
+ )
693
+
694
+ search_r10.click(set_date, search_r10, date_dd).then(
695
+ set_papers,
696
+ inputs=[date_dd, search_r10],
697
+ outputs=[papers_dd, search_in]
698
+ )
699
+
700
+ date_dd.input(get_papers, date_dd, papers_dd).then(
701
  set_paper,
702
  [date_dd, papers_dd],
703
  [
 
737
 
738
  search_in.change(
739
  inputs=[search_in],
740
+ outputs=[
741
+ search_r1, search_r2, search_r3, search_r4, search_r5,
742
+ search_r6, search_r7, search_r8, search_r9, search_r10
743
+ ],
744
  js=UPDATE_SEARCH_RESULTS,
745
  fn=None
746
  )
 
755
  ]
756
  )
757
 
758
+ conv_type.select(
759
+ inputs=[conv_type],
760
+ js=UPDATE_IF_TYPE,
761
+ outputs=None,
762
+ fn=None
763
+ )
764
+
765
+ start_date = datetime.now() + timedelta(minutes=1)
766
+ scheduler = BackgroundScheduler()
767
+ scheduler.add_job(
768
+ process_arxiv_ids,
769
+ trigger='interval',
770
+ seconds=3600,
771
+ args=[
772
+ gemini_api_key,
773
+ dataset_repo_id,
774
+ request_arxiv_repo_id,
775
+ hf_token
776
+ ],
777
+ start_date=start_date
778
+ )
779
+ scheduler.start()
780
+
781
+ demo.launch(share=True, debug=True)
constants/prompts.toml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [basic_qa]
2
+ prompt = """
3
+ come up with the 6 questions and answers that could be commonly asked by people about the following paper.
4
+ There should be two types of answers included, one for expert and the other for ELI5.
5
+ Your response should be recorded in a JSON format as ```json{"title": text, "summary": text, "qna": [{"question": text, "answers": {"eli5": text, "expert": text}}, ...]}```
6
+ """
7
+
8
+ [deep_qa]
9
+ prompt = """
10
+ Paper title: $title
11
+ Previous question: $previous_question
12
+ The answer on the previous question: $previous_answer
13
+
14
+ Based on the previous question and answer above, and based on the paper content below, suggest follow-up question and answers in $tone manner.
15
+ There should be two types of answers included, one for expert and the other for ELI5.
16
+ Your response should be recorded in a JSON format as ```json{"follow up question": text, "answers": {"eli5": text, "expert": text}}```
17
+ """
date_iterator.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Run app.py once for every calendar day in an inclusive date range.
#
# Usage: date_iterator.sh START_DATE END_DATE HF_REPO_ID
#   START_DATE, END_DATE  dates formatted as YYYY-MM-DD
#   HF_REPO_ID            Hugging Face dataset repository passed to app.py
#
# GEMINI_API_KEY and HF_ACCESS_TOKEN are read from the environment.
#
# NOTE: the `date -j -f ... / -r ...` invocations below are the BSD/macOS
# flavour of date(1); GNU date does not accept -j and needs `-d` instead.

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 START_DATE END_DATE HF_REPO_ID" >&2
    exit 1
fi

# Set start and end dates (format YYYY-MM-DD)
start_date=$1
end_date=$2
hf_repo_id=$3

# Convert dates into seconds since epoch (for easier calculations)
start_seconds=$(date -j -f "%Y-%m-%d" "$start_date" "+%s")
end_seconds=$(date -j -f "%Y-%m-%d" "$end_date" "+%s")

# Iterate through dates
current_seconds=$start_seconds
while [[ $current_seconds -le $end_seconds ]]; do
    current_date=$(date -j -r "$current_seconds" "+%Y-%m-%d")

    # Quote every expansion so empty or space-containing values cannot
    # shift the arguments that app.py receives.
    echo "Running program for date: $current_date"
    python app.py --target-date "$current_date" \
        --gemini-api "$GEMINI_API_KEY" \
        --hf-token "$HF_ACCESS_TOKEN" \
        --hf-repo-id "$hf_repo_id" \
        --hf-daily-papers

    current_seconds=$((current_seconds + 86400)) # Add 1 day (86400 seconds)
done
27
+
gen/gemini.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import copy
3
+ import toml
4
+ from string import Template
5
+ from pathlib import Path
6
+ from flatdict import FlatDict
7
+ import google.generativeai as genai
8
+
9
+ from gen.utils import parse_first_json_snippet
10
+
11
def determine_model_name(given_image=None):
    """Return the Gemini model name matching the kind of input.

    Args:
        given_image: Image payload, or None for a text-only request.

    Returns:
        "gemini-pro" when no image is given, otherwise "gemini-pro-vision".
    """
    return "gemini-pro" if given_image is None else "gemini-pro-vision"
16
+
17
def construct_image_part(given_image):
    """Wrap image data in the part dictionary appended to a Gemini prompt.

    Args:
        given_image: Image payload stored under the "data" key unchanged.

    Returns:
        A dict with "mime_type" fixed to "image/jpeg" and "data" set to
        *given_image*.
    """
    return dict(mime_type="image/jpeg", data=given_image)
22
+
23
def call_gemini(prompt="", API_KEY=None, given_text=None, given_image=None, generation_config=None, safety_settings=None):
    """Send a prompt (plus optional text and image) to Gemini and return the reply text.

    Args:
        prompt: Instruction text placed at the top of the request.
        API_KEY: Gemini API key passed to ``genai.configure``.
        given_text: Optional body text appended below the prompt after a
            separator line.
        given_image: Optional image payload; when present the vision model
            is selected and the image is sent as a second prompt part.
        generation_config: Optional generation settings; a default with
            temperature 0.8 and 4096 output tokens is used when None.
        safety_settings: Optional safety settings; when None every harm
            category is set to "BLOCK_NONE".

    Returns:
        The ``text`` attribute of the Gemini response.
    """
    genai.configure(api_key=API_KEY)

    if generation_config is None:
        generation_config = {
            "temperature": 0.8,
            "top_p": 1,
            "top_k": 32,
            "max_output_tokens": 4096,
        }

    if safety_settings is None:
        safety_settings = [
            {"category": category, "threshold": "BLOCK_NONE"}
            for category in (
                "HARM_CATEGORY_HARASSMENT",
                "HARM_CATEGORY_HATE_SPEECH",
                "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                "HARM_CATEGORY_DANGEROUS_CONTENT",
            )
        ]

    model = genai.GenerativeModel(
        model_name=determine_model_name(given_image),
        generation_config=generation_config,
        safety_settings=safety_settings,
    )

    user_prompt = prompt
    if given_text is not None:
        # The prompt used to be appended to itself here, so every request
        # with text carried the instructions twice; build the string once.
        user_prompt = (
            f"{prompt}\n"
            "------------------------------------------------\n"
            f"{given_text}\n"
        )

    prompt_parts = [user_prompt]
    if given_image is not None:
        prompt_parts.append(construct_image_part(given_image))

    response = model.generate_content(prompt_parts)
    return response.text
71
+
72
def try_out(prompt, given_text, gemini_api_key, given_image=None, retry_num=5):
    """Call Gemini and parse its reply as JSON, retrying on failure.

    Args:
        prompt: Instruction text for the model.
        given_text: Body text sent along with the prompt.
        gemini_api_key: Gemini API key.
        given_image: Optional image payload.
        retry_num: Maximum number of attempts.

    Returns:
        The parsed JSON object of the first successful attempt, or None when
        every attempt failed.
    """
    # Every attempt counts against the budget, so the loop always ends even
    # if parsing yields None without raising.
    for _ in range(retry_num):
        try:
            qna = call_gemini(
                prompt=prompt,
                given_text=given_text,
                given_image=given_image,
                API_KEY=gemini_api_key
            )
            qna_json = parse_first_json_snippet(qna)
        except Exception as e:
            # Deliberately broad: both API errors and parse errors are
            # treated as a reason to try again.
            print(f"......retry {e}")
            continue

        if qna_json is not None:
            return qna_json

    return None
91
+
92
def get_basic_qa(text, gemini_api_key, trucate=7000):
    """Generate the basic Q&A set for a paper.

    Args:
        text: Paper text; only the first *trucate* characters are sent.
        gemini_api_key: Gemini API key.
        trucate: Number of leading characters of *text* to send.

    Returns:
        Whatever ``try_out`` returns for the ``basic_qa`` prompt.
    """
    prompt_file = Path('.') / 'constants' / 'prompts.toml'
    basic_prompt = toml.load(prompt_file)['basic_qa']['prompt']
    return try_out(basic_prompt, text[:trucate], gemini_api_key=gemini_api_key)
96
+
97
+
98
def _is_complete_followup(response):
    """Return True when *response* holds a follow-up question with both answers."""
    if not isinstance(response, dict) or 'follow up question' not in response:
        return False
    answers = response.get('answers')
    return isinstance(answers, dict) and 'eli5' in answers and 'expert' in answers


def _request_followup(prompt, text, gemini_api_key):
    """Ask for a follow-up until it is complete; return None once try_out gives up."""
    while True:
        response = try_out(prompt, text, gemini_api_key=gemini_api_key)
        # None means try_out exhausted its own retries; stop instead of
        # crashing on a membership test against None.
        if response is None or _is_complete_followup(response):
            return response


def get_deep_qa(text, basic_qa, gemini_api_key, trucate=7000):
    """Add in-depth and broad follow-up Q&As to every basic question.

    For each entry of ``basic_qa['qna']`` two follow-ups are requested (tone
    "in-depth" and tone "broad"). The entry is then flattened and its keys,
    prefixed with ``"<index>_"``, are merged into *basic_qa*.

    Args:
        text: Paper text; only the first *trucate* characters are sent.
        basic_qa: Dict with at least ``title`` and ``qna``; updated in place.
        gemini_api_key: Gemini API key.
        trucate: Number of leading characters of *text* to send.

    Returns:
        The same *basic_qa* dict, extended with the flattened keys. A
        follow-up that could not be obtained is left out.
    """
    prompts = toml.load(Path('.') / 'constants' / 'prompts.toml')
    followup_template = Template(prompts['deep_qa']['prompt'])

    title = basic_qa['title']
    excerpt = text[:trucate]
    followup_kinds = (
        ('additional_depth_q', 'in-depth'),
        ('additional_breath_q', 'broad'),
    )

    for idx, qna in enumerate(copy.deepcopy(basic_qa['qna'])):
        question = qna['question']
        expert_answer = qna['answers']['expert']

        for key, tone in followup_kinds:
            followup_prompt = followup_template.substitute(
                title=title,
                previous_question=question,
                previous_answer=expert_answer,
                tone=tone,
            )
            followup = _request_followup(followup_prompt, excerpt, gemini_api_key)
            if followup is not None:
                qna[key] = followup

        # FlatDict joins nested keys with its delimiter; prefix each
        # flattened key with the question index before merging.
        basic_qa.update({f'{idx}_{k}': v for k, v in FlatDict(qna).items()})

    return basic_qa
gen/utils.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
def find_json_snippet(raw_snippet):
    """Extract and parse the JSON object embedded in a string.

    The candidate runs from the first ``{`` to the last ``}`` of
    *raw_snippet* and is parsed with ``strict=False`` so control characters
    inside strings are accepted.

    Args:
        raw_snippet: Text that is expected to contain one JSON object.

    Returns:
        The parsed JSON value.

    Raises:
        ValueError: If no brace-delimited span exists or it is not valid JSON.
    """
    json_start_index = raw_snippet.find('{')
    json_end_index = raw_snippet.rfind('}')

    # A closing brace that precedes the first opening brace is no snippet.
    if json_start_index < 0 or json_end_index < json_start_index:
        raise ValueError('......No JSON code snippet found in string.')

    json_snippet = raw_snippet[json_start_index:json_end_index + 1]
    try:
        return json.loads(json_snippet, strict=False)
    except json.JSONDecodeError as err:
        raise ValueError('......failed to parse string into JSON format') from err
19
+
20
def parse_first_json_snippet(snippet):
    """Parse the first JSON object found in a string or a list of strings.

    Args:
        snippet: Either one string, or a list of strings that are tried in
            order.

    Returns:
        The parsed JSON value. For a list, the first piece that parses wins
        and None is returned when no piece parses.

    Raises:
        ValueError: If *snippet* is not a list and cannot be parsed; the
            message of the underlying error is preserved.
    """
    # Failures of find_json_snippet itself, or of calling str methods on a
    # value that is not a string.
    parse_errors = (ValueError, AttributeError, TypeError)

    if isinstance(snippet, list):
        for snippet_piece in snippet:
            try:
                return find_json_snippet(snippet_piece)
            except parse_errors:
                continue
        return None

    try:
        return find_json_snippet(snippet)
    except parse_errors as e:
        print(e)
        # Keep the message so callers that log the exception see the cause.
        raise ValueError(str(e)) from e
outputs.json ADDED
The diff for this file is too large to render. See raw diff
 
paper/download.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import json
3
+ import requests
4
+ import datetime
5
+ from datetime import date
6
+ from datetime import datetime
7
+ import xml.etree.ElementTree as ET
8
+ from requests.exceptions import HTTPError
9
+
10
+ def _get_today():
11
+ return str(date.today())
12
+
13
def _download_pdf_from_arxiv(filename):
    """Download a PDF from arXiv and return its bytes.

    Args:
        filename: Last path segment of the PDF URL, e.g. ``"<id>.pdf"``.

    Returns:
        The response body as bytes.

    Raises:
        Exception: If the server answers with a status other than 200.
        requests.exceptions.RequestException: On connection errors or when
            the request times out.
    """
    url = f'https://arxiv.org/pdf/{filename}'
    # requests waits indefinitely by default; bound the wait so a stalled
    # connection cannot hang the caller.
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        raise Exception(f"Failed to download pdf for arXiv id {filename}")
    return response.content
20
+
21
def download_pdf_from_arxiv(arxiv_id):
    """Download the PDF of an arXiv paper into the current directory.

    Args:
        arxiv_id: arXiv identifier of the paper.

    Returns:
        The name of the written file, ``"<arxiv_id>.pdf"``.
    """
    filename = f"{arxiv_id}.pdf"
    pdf_bytes = _download_pdf_from_arxiv(filename)

    # Persist the downloaded bytes next to the running program.
    with open(filename, "wb") as pdf_file:
        pdf_file.write(pdf_bytes)

    return filename
30
+
31
def _get_papers_from_hf_daily_papers(target_date):
    """Fetch the raw Hugging Face Daily Papers listing for one date.

    Args:
        target_date: Date string used as the ``date`` query parameter; when
            None, today's date is used.

    Returns:
        A ``(target_date, body)`` tuple where *body* is the response text.

    Raises:
        HTTPError: If the server answers with a status other than 200.
        requests.exceptions.RequestException: On connection errors or when
            the request times out.
    """
    if target_date is None:
        target_date = _get_today()
        print(f"target_date is not set => scrap today's papers [{target_date}]")
    url = f"https://huggingface.co/api/daily_papers?date={target_date}"

    # requests waits indefinitely by default; bound the wait so a stalled
    # connection cannot hang the caller.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        raise HTTPError(f"Error fetching data. Status code: {response.status_code}")
    return target_date, response.text
43
+
44
def get_papers_from_hf_daily_papers(target_date):
    """Return the Daily Papers entries for a date, each tagged with that date.

    Args:
        target_date: Date string, or None to use today's date.

    Returns:
        A ``(target_date, papers)`` tuple; every item of *papers* gets a
        ``"target_date"`` key holding the resolved date.
    """
    resolved_date, raw_body = _get_papers_from_hf_daily_papers(target_date)
    papers = json.loads(raw_body)
    for paper in papers:
        paper["target_date"] = resolved_date
    return resolved_date, papers
50
+
51
+
52
def _get_paper_xml_by_arxiv_id(arxiv_id):
    """Query the arXiv export API for one paper.

    Args:
        arxiv_id: arXiv identifier used in the ``id:`` search query.

    Returns:
        The ``requests`` response object; its status is not checked here.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the request times out.
    """
    url = f"http://export.arxiv.org/api/query?search_query=id:{arxiv_id}&start=0&max_results=1"
    # requests waits indefinitely by default; bound the wait so a stalled
    # connection cannot hang the caller.
    return requests.get(url, timeout=30)
55
+
56
+ def _is_arxiv_id_valid(arxiv_id):
57
+ pattern = r"^\d{4}\.\d{5}$"
58
+ return bool(re.match(pattern, arxiv_id))
59
+
60
+ def _get_paper_metadata_by_arxiv_id(response):
61
+ root = ET.fromstring(response.content)
62
+
63
+ # Example: Extracting title, authors, and abstract
64
+ title = root.find('{http://www.w3.org/2005/Atom}entry/{http://www.w3.org/2005/Atom}title').text
65
+ authors = [author.find('{http://www.w3.org/2005/Atom}name').text for author in root.findall('{http://www.w3.org/2005/Atom}entry/{http://www.w3.org/2005/Atom}author')]
66
+ abstract = root.find('{http://www.w3.org/2005/Atom}entry/{http://www.w3.org/2005/Atom}summary').text
67
+ target_date = root.find('{http://www.w3.org/2005/Atom}entry/{http://www.w3.org/2005/Atom}published').text
68
+
69
+ return title, authors, abstract, target_date
70
+
71
def get_papers_from_arxiv_ids(arxiv_ids):
    """Collect metadata records for a list of arXiv identifiers.

    Invalid identifiers and papers whose metadata cannot be fetched or
    parsed are reported on stdout and skipped.

    Args:
        arxiv_ids: Iterable of arXiv identifier strings.

    Returns:
        A list of dicts with ``title``, ``target_date`` (``YYYY-MM-DD``) and
        a nested ``paper`` dict holding ``summary``, ``id`` and ``authors``.
    """
    results = []

    for arxiv_id in arxiv_ids:
        print(arxiv_id)
        if not _is_arxiv_id_valid(arxiv_id):
            print(f"......not a valid arXiv ID[{arxiv_id}]")
            continue

        try:
            xml_data = _get_paper_xml_by_arxiv_id(arxiv_id)
            title, authors, abstract, target_date = _get_paper_metadata_by_arxiv_id(xml_data)

            datetime_obj = datetime.strptime(target_date, "%Y-%m-%dT%H:%M:%SZ")
            formatted_date = datetime_obj.strftime("%Y-%m-%d")
        except (requests.RequestException, ET.ParseError, AttributeError, ValueError) as err:
            # Network failure, malformed XML, a feed without the expected
            # elements, or an unexpected date format.
            print("......something wrong happend when downloading metadata")
            print("......this usually happens when you try out the today's published paper")
            print(f"......{type(err).__name__}: {err}")
            continue

        results.append(
            {
                "title": title,
                "target_date": formatted_date,
                "paper": {
                    "summary": abstract,
                    "id": arxiv_id,
                    "authors": authors,
                }
            }
        )

    return results
paper/parser.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz
3
+ import PyPDF2
4
+
5
def extract_text_and_figures(pdf_path):
    """
    Extracts text and figures from a PDF file, then deletes the file.

    The text of every page is collected twice: first with PyMuPDF (fitz),
    then again with PyPDF2, so the returned list holds two entries per page.

    Args:
        pdf_path (str): The path to the PDF file. The file is removed once
            extraction has finished.

    Returns:
        tuple: A tuple containing two lists:
            * A list of extracted text blocks.
            * A list of extracted figures (PNG-encoded bytes).
    """

    texts = []
    figures = []

    # Open the PDF using PyMuPDF (fitz) for text and image extraction. The
    # context manager closes the document before the file is removed below.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            texts.append(page.get_text("text"))  # Extract text as plain text

            # Process images on the page
            for img in page.get_images():
                xref = img[0]  # Image XREF
                pix = fitz.Pixmap(doc, xref)  # Create Pixmap image

                # Grayscale or RGB can be written directly; anything with
                # more components (CMYK) is converted to RGB first.
                if pix.n >= 5:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                figures.append(pix.tobytes("png"))

    # Extract additional text using PyPDF2 (in case fitz didn't get everything)
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        texts.extend(page.extract_text() for page in pdf_reader.pages)

    try:
        os.remove(pdf_path)
    except FileNotFoundError:
        print(f"File '{pdf_path}' not found.")
    except PermissionError:
        print(f"Unable to remove '{pdf_path}'. Check permissions.")

    return texts, figures
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ google-generativeai
2
+ pypdf2
3
+ PyMuPDF
4
+ gradio
5
+ requests
6
+ toml
7
+ datasets
8
+ flatdict
9
+ APScheduler
utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import datasets
3
+ from datasets import Dataset
4
+ from huggingface_hub import create_repo
5
+ from huggingface_hub.utils import HfHubHTTPError
6
+
7
def push_to_hf_hub(
    qnas, repo_id, token, append=True
):
    """Push one Q&A record to a Hugging Face dataset repository.

    Args:
        qnas: Mapping holding one record; it must contain a ``target_date``
            field, which is cast to ``timestamp[s]``.
        repo_id: Dataset repository identifier.
        token: Hugging Face access token.
        append: When True and the repository already exists, the record is
            appended to the existing ``train`` split instead of replacing it.
    """
    df = pd.DataFrame([qnas])
    ds = Dataset.from_pandas(df)
    ds = ds.cast_column("target_date", datasets.features.Value("timestamp[s]"))

    try:
        create_repo(repo_id, repo_type="dataset", token=token)
        exist = False
    except HfHubHTTPError:
        # NOTE(review): any Hub HTTP error is taken to mean "repository
        # already exists" — confirm this is intended for auth failures too.
        exist = True

    if exist and append:
        existing_ds = datasets.load_dataset(repo_id)
        ds = datasets.concatenate_datasets([existing_ds['train'], ds])

    ds.push_to_hub(repo_id, token=token)
+ ds.push_to_hub(repo_id, token=token)