Kadi-IAM committed on
Commit
26d01d9
·
verified ·
1 Parent(s): 87721bf

Upload 5 files

Browse files
Files changed (3) hide show
  1. README.md +4 -0
  2. app.py +31 -19
  3. json2kadi.py +90 -153
README.md CHANGED
@@ -10,3 +10,7 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+
14
+ Demo: https://huggingface.co/spaces/Kadi-IAM/KadiTextract
15
+
16
+ A simple web app to obtain structured output from text input using Large Language Models (LLMs).
app.py CHANGED
@@ -1,14 +1,20 @@
 
 
 
 
 
1
  import os
2
  import json
3
  import gradio as gr
4
  import groq
5
  from difflib import Differ
6
- from json2kadi import transform_value, my_json_to_kadi
7
  from kadi_apy.lib.conversion import json_to_kadi
8
 
9
  # Set api key of Groq
10
  api_key = os.getenv("GROQ_API")
11
 
 
12
  example_1 = (
13
  """John B. Goodenough (1922–2023) was a renowned American physicist and materials scientist,
14
  best known for his pioneering work in developing the lithium-ion battery. He earned a Ph.D. in physics from
@@ -141,7 +147,11 @@ example_4 = (
141
  """,
142
  )
143
 
 
144
  def generate_response(prompt):
 
 
 
145
  if not prompt:
146
  return "No transcription available. Please try speaking again."
147
 
@@ -162,6 +172,8 @@ def generate_response(prompt):
162
 
163
 
164
  def post_process_output(output):
 
 
165
  # 1. remove json mark
166
  output = output.replace("```", "")
167
  output = output.replace("null", '""')
@@ -172,6 +184,7 @@ def post_process_output(output):
172
  return output
173
 
174
 
 
175
  extract_info_prompt = """
176
  You are an data scientist, extract information from text with given template in json format. Do not add any explanation.
177
 
@@ -184,6 +197,8 @@ Template:
184
 
185
 
186
  def extract_info(input_text, structure_template):
 
 
187
  # validate structure_template is json
188
  try:
189
  structure_template = json.dumps(json.loads(structure_template), indent=4)
@@ -202,11 +217,13 @@ def extract_info(input_text, structure_template):
202
  except Exception as e:
203
  print("Error in json format, retrying...")
204
  continue
205
-
206
  return structured_output
207
 
208
 
209
  def diff_texts(text1, text2):
 
 
210
  d = Differ()
211
  return [
212
  (token[2:], token[0] if token[0] != " " else None)
@@ -215,6 +232,8 @@ def diff_texts(text1, text2):
215
 
216
 
217
  def transform_json_to_kadi_schema(input_json_str):
 
 
218
  input_json = json.loads(input_json_str)
219
  try:
220
  output_json = my_json_to_kadi(input_json)
@@ -225,6 +244,7 @@ def transform_json_to_kadi_schema(input_json_str):
225
  return json.dumps(output_json, indent=2)
226
 
227
 
 
228
  example_structure_template = """
229
  {
230
  "Material": {
@@ -252,7 +272,10 @@ example_structure_template = """
252
  """
253
 
254
 
 
255
  def suggest_template(input_text):
 
 
256
  if not input_text.strip():
257
  raise gr.Error("The input text should not be empty.")
258
  combined_prompt = f"""
@@ -282,6 +305,7 @@ def suggest_template(input_text):
282
  return output
283
 
284
 
 
285
  with gr.Blocks() as demo:
286
  gr.Markdown(
287
  "### A simple web app to obtain structured output from text input using Large Language Models (LLMs)."
@@ -305,16 +329,6 @@ with gr.Blocks() as demo:
305
  placeholder="Enter your structure template here.",
306
  )
307
 
308
- # with gr.Accordion("Show detailed writing instruction", open=False):
309
- # gr.Markdown(
310
- # "Note: modify **[topic]** in writing instruction accordingly."
311
- # )
312
- # prompt_input = gr.Textbox(
313
- # label="Writing instruction",
314
- # value="I am writing a paper on [topic] for a leading academic journal and would like help refining a specific section. Please rephrase the section to enhance clarity, coherence, and conciseness, ensuring smooth transitions between paragraphs and logical flow. Remove any unnecessary jargon and maintain a formal, professional tone suitable for an academic audience.",
315
- # lines=5,
316
- # )
317
-
318
  with gr.Row():
319
  suggest_btn = gr.Button("Suggest template", scale=1)
320
  submit_btn = gr.Button("Extract", variant="primary", scale=2)
@@ -322,12 +336,6 @@ with gr.Blocks() as demo:
322
  with gr.Column():
323
  output = gr.Textbox(label="Structured Output", show_copy_button=True)
324
  with gr.Accordion("Show Kadi-compatible output", open=False):
325
- # output_diff = gr.HighlightedText(
326
- # label="Diff",
327
- # combine_adjacent=True,
328
- # show_legend=True,
329
- # color_map={"-": "red", "+": "green"},
330
- # )
331
  output_kadi = gr.Textbox(
332
  label="Kadi compatible metadata output",
333
  lines=5,
@@ -335,9 +343,12 @@ with gr.Blocks() as demo:
335
  )
336
 
337
  gr.Markdown()
338
- gr.Markdown("Add metadata by copying and pasting in [Kadi](https://kadi.iam.kit.edu/) Record")
 
 
339
  gr.Markdown("![](file/copy_to_kadi.png)")
340
 
 
341
  submit_btn.click(
342
  fn=extract_info, inputs=[text_input, structure_template], outputs=output
343
  )
@@ -349,6 +360,7 @@ with gr.Blocks() as demo:
349
  fn=transform_json_to_kadi_schema, inputs=[output], outputs=output_kadi
350
  )
351
 
 
352
  gr.Markdown()
353
  gr.Markdown()
354
  gr.Markdown()
 
1
+ """
2
+ This application demo shows how to extract structured information using LLMs
3
+ and transfer it as metadata in Kadi.
4
+ """
5
+
6
  import os
7
  import json
8
  import gradio as gr
9
  import groq
10
  from difflib import Differ
11
+ from json2kadi import my_json_to_kadi
12
  from kadi_apy.lib.conversion import json_to_kadi
13
 
14
  # Set api key of Groq
15
  api_key = os.getenv("GROQ_API")
16
 
17
+ # Examples
18
  example_1 = (
19
  """John B. Goodenough (1922–2023) was a renowned American physicist and materials scientist,
20
  best known for his pioneering work in developing the lithium-ion battery. He earned a Ph.D. in physics from
 
147
  """,
148
  )
149
 
150
+
151
  def generate_response(prompt):
152
+ """
153
+ Get response (structured json) from LLMs.
154
+ """
155
  if not prompt:
156
  return "No transcription available. Please try speaking again."
157
 
 
172
 
173
 
174
  def post_process_output(output):
175
+ """Clean up output."""
176
+
177
  # 1. remove json mark
178
  output = output.replace("```", "")
179
  output = output.replace("null", '""')
 
184
  return output
185
 
186
 
187
+ # Basic prompt for extraction
188
  extract_info_prompt = """
189
  You are an data scientist, extract information from text with given template in json format. Do not add any explanation.
190
 
 
197
 
198
 
199
  def extract_info(input_text, structure_template):
200
+ """Extract structured output from text input."""
201
+
202
  # validate structure_template is json
203
  try:
204
  structure_template = json.dumps(json.loads(structure_template), indent=4)
 
217
  except Exception as e:
218
  print("Error in json format, retrying...")
219
  continue
220
+
221
  return structured_output
222
 
223
 
224
  def diff_texts(text1, text2):
225
+ """Compare two text inputs."""
226
+
227
  d = Differ()
228
  return [
229
  (token[2:], token[0] if token[0] != " " else None)
 
232
 
233
 
234
  def transform_json_to_kadi_schema(input_json_str):
235
+ """Transform JSON into Kadi metadata schema."""
236
+
237
  input_json = json.loads(input_json_str)
238
  try:
239
  output_json = my_json_to_kadi(input_json)
 
244
  return json.dumps(output_json, indent=2)
245
 
246
 
247
+ # Basic template for inferring json template
248
  example_structure_template = """
249
  {
250
  "Material": {
 
272
  """
273
 
274
 
275
+ # Infer template from text input based on example template defined above
276
  def suggest_template(input_text):
277
+ """Infer structured template from text input."""
278
+
279
  if not input_text.strip():
280
  raise gr.Error("The input text should not be empty.")
281
  combined_prompt = f"""
 
305
  return output
306
 
307
 
308
+ # Gradio UI
309
  with gr.Blocks() as demo:
310
  gr.Markdown(
311
  "### A simple web app to obtain structured output from text input using Large Language Models (LLMs)."
 
329
  placeholder="Enter your structure template here.",
330
  )
331
 
 
 
 
 
 
 
 
 
 
 
332
  with gr.Row():
333
  suggest_btn = gr.Button("Suggest template", scale=1)
334
  submit_btn = gr.Button("Extract", variant="primary", scale=2)
 
336
  with gr.Column():
337
  output = gr.Textbox(label="Structured Output", show_copy_button=True)
338
  with gr.Accordion("Show Kadi-compatible output", open=False):
 
 
 
 
 
 
339
  output_kadi = gr.Textbox(
340
  label="Kadi compatible metadata output",
341
  lines=5,
 
343
  )
344
 
345
  gr.Markdown()
346
+ gr.Markdown(
347
+ "Add metadata by copying and pasting in [Kadi](https://kadi.iam.kit.edu/) Record"
348
+ )
349
  gr.Markdown("![](file/copy_to_kadi.png)")
350
 
351
+ # Actions
352
  submit_btn.click(
353
  fn=extract_info, inputs=[text_input, structure_template], outputs=output
354
  )
 
360
  fn=transform_json_to_kadi_schema, inputs=[output], outputs=output_kadi
361
  )
362
 
363
+ # Placeholder
364
  gr.Markdown()
365
  gr.Markdown()
366
  gr.Markdown()
json2kadi.py CHANGED
@@ -1,46 +1,43 @@
1
  import json
2
 
 
 
3
  def transform_value(key, value):
4
  if isinstance(value, dict):
5
- if 'Value' in value and 'Unit' in value:
6
- value_type = "str" if isinstance(value['Value'], str) else "float"
7
  return {
8
  "key": key,
9
  "type": "dict",
10
  "value": [
11
- {"key": "Value", "type": value_type, "value": value['Value']},
12
- {"key": "Unit", "type": "str", "value": value['Unit']}
13
- ]
14
  }
15
  else:
16
  return {
17
  "key": key,
18
  "type": "dict",
19
- "value": [transform_value(k, v) for k, v in value.items()]
20
  }
21
  elif isinstance(value, list):
22
  return {
23
  "key": key,
24
  "type": "list",
25
- "value": [transform_value("", item) for item in value]
26
  }
27
  elif isinstance(value, str):
28
- return {
29
- "key": key,
30
- "type": "str",
31
- "value": value
32
- }
33
  else:
34
  raise ValueError(f"Unsupported value type: {type(value)}")
35
 
 
36
  def my_json_to_kadi(data):
37
  return [transform_value(key, value) for key, value in data.items()]
38
 
39
 
40
-
41
  # Print the output JSON in a formatted way
42
-
43
- # Example JSON input
44
  input_json = {
45
  "Material": {
46
  "Name": "LLTO",
@@ -52,156 +49,96 @@ input_json = {
52
  "Dendrite Formation Risk": "",
53
  "Operating Voltage": "",
54
  "Flexibility": "",
55
- "Processing": ""
56
- }
57
  },
58
  "Performance": {
59
  "Specific Capacity": {"Value": "", "Unit": ""},
60
  "Energy Density": {"Value": "", "Unit": ""},
61
  "Capacity Retention": "",
62
- "Operating Temperature": {"Value": "Room temperature", "Unit": ""}
63
  },
64
- "Usage": {
65
- "Battery Type": "",
66
- "Benefits": []
67
- }
68
  }
69
 
70
- # input_json = {
71
- # "Doctor_Patient_Discussion": {
72
- # "Initial_Observation": {
73
- # "Symptoms": [
74
- # "pale",
75
- # "sore throat",
76
- # "running a temperature"
77
- # ],
78
- # "Initial_Assessment": "You\u2019ve moderate fever."
79
- # },
80
- # "Medical_Examination": {
81
- # "Temperature": "99.8",
82
- # "Blood_Pressure": "fine",
83
- # "Doctor_Assessment": "few symptoms of malaria",
84
- # "Diagnosis": "few symptoms of malaria"
85
- # },
86
- # "Treatment_Plan": {
87
- # "Prescription": [
88
- # "three medicines",
89
- # "a syrup"
90
- # ]
91
- # }
92
- # }
93
- # }
94
-
95
- # input_json = {
96
- # "Doctor_Patient_Discussion": {
97
- # "Initial_Observation": {
98
- # "Symptoms": [
99
- # "pale",
100
- # "sore throat",
101
- # "running a temperature"
102
- # ],
103
- # "Initial_Assessment": "You\u2019ve moderate fever."
104
- # },
105
- # "Medical_Examination": {
106
- # "Temperature": "99.8",
107
- # "Blood_Pressure": "fine",
108
- # "Doctor_Assessment": "few symptoms of malaria",
109
- # "Diagnosis": "few symptoms of malaria"
110
- # },
111
- # "Treatment_Plan": {
112
- # "Prescription": [
113
- # "three medicines",
114
- # "a syrup"
115
- # ]
116
- # }
117
- # }
118
- # }
119
-
120
  input_json = {
121
- "Experiment": {
122
- "Material": "LATP powders",
123
- "SynthesisRoute": "modified sol-gel synthesis route described by (Bucharsky et al., 2015)",
124
- "Precursors": [
125
- {
126
- "Name": "lithium acetate Li(C2H3O2) ⋅2H2O",
127
- "Purity": "purity ≥ 99 %",
128
- "Supplier": "Alfa Aesar GmbH & Co KG",
129
- "Location": "Germany"
130
- },
131
- {
132
- "Name": "aluminum nitrate Al(NO3)3 ⋅9H2O",
133
- "Purity": "purity ≥ 98.5 %",
134
- "Supplier": "Merck KGaA",
135
- "Location": "Germany"
136
- },
137
- {
138
- "Name": "titanium-isopropoxide Ti[OCH(CH3)2]4",
139
- "Purity": "purity ≥ 98 %",
140
- "Supplier": "Merck KGaA",
141
- "Location": "Germany"
142
- }
143
- ],
144
- "Procedure": [
145
- {
146
- "Step": "Dissolve lithium acetate and aluminum nitrate in distilled water under constant stirring."
147
- },
148
- {
149
- "Step": "Add titanium-isopropoxide dropwise to the solution."
150
- },
151
- {
152
- "Step": "Add phosphoric acid slowly through a drip funnel to form a gel."
153
- },
154
- {
155
- "Step": "Dry the gel at room temperature for 24 h."
156
- }
157
- ],
158
- "HeatTreatment": [
159
- {
160
- "Step": "First, heat treat samples at 400°C for 6 h to achieve precursor formation and eliminate reaction gases."
161
- },
162
- {
163
- "Step": "Second, process samples at 900°C for 8 h to complete the reaction to crystalline LATP."
164
- }
165
- ],
166
- "BatchVariations": [
167
- {
168
- "Description": "Prepare one batch with all precursors in stoichiometric quantities (marked as 0.0 wt%)."
169
- },
170
- {
171
- "Description": "Explore different batches with either an excess up to +7.5 wt% or a deficiency up to -15.0 wt% of phosphoric acid compared to the stoichiometric composition."
172
- }
173
- ],
174
- "Processing": [
175
- {
176
- "Step": "Process the obtained powders in a planetary ball mill."
177
- },
178
- {
179
- "Step": "Form pellets by uniaxial pressing and then further densify by cold isostatic pressing at 400 MPa."
180
- },
181
- {
182
- "Step": "All pressed samples have a green density of approximately 62% relative density."
183
- }
184
- ],
185
- "Sintering": {
186
- "TemperatureRange": "850 to 1,050°C",
187
- "IsothermalSinteringTime": "30 to 540 min",
188
- "Cooling": "Cool down to room temperature in furnace",
189
- "DensityDetermination": "Determine densities by Archimedes’ method"
190
- },
191
- "IonicConductivityMeasurements": {
192
- "Method": "Impedance analysis",
193
- "Conditions": "At room temperature over the frequency range from 0.1 Hz to 1 MHz with an AC amplitude of 50 mV in the frequency response analyzer (AMTEK GmbH, VersaSTAT 4, Pennsylvania, United States)",
194
- "Reference": "For further details of the experimental part please refer to our previous work (Schiffmann et al., 2021)"
195
  }
196
- }
197
  }
198
 
199
 
200
  if __name__ == "__main__":
201
- # Transform the input JSON
202
- # output_json = transform_json2kadi(input_json)
203
- from kadi_apy.lib.conversion import json_to_kadi
204
- output_json = json_to_kadi(input_json)
205
 
206
- # Print the output JSON
207
- print(json.dumps(output_json, indent=2))
 
1
  import json
2
 
3
+
4
+ # Transform value in metadata
5
  def transform_value(key, value):
6
  if isinstance(value, dict):
7
+ if "Value" in value and "Unit" in value:
8
+ value_type = "str" if isinstance(value["Value"], str) else "float"
9
  return {
10
  "key": key,
11
  "type": "dict",
12
  "value": [
13
+ {"key": "Value", "type": value_type, "value": value["Value"]},
14
+ {"key": "Unit", "type": "str", "value": value["Unit"]},
15
+ ],
16
  }
17
  else:
18
  return {
19
  "key": key,
20
  "type": "dict",
21
+ "value": [transform_value(k, v) for k, v in value.items()],
22
  }
23
  elif isinstance(value, list):
24
  return {
25
  "key": key,
26
  "type": "list",
27
+ "value": [transform_value("", item) for item in value],
28
  }
29
  elif isinstance(value, str):
30
+ return {"key": key, "type": "str", "value": value}
 
 
 
 
31
  else:
32
  raise ValueError(f"Unsupported value type: {type(value)}")
33
 
34
+
35
  def my_json_to_kadi(data):
36
  return [transform_value(key, value) for key, value in data.items()]
37
 
38
 
 
39
  # Print the output JSON in a formatted way
40
+ # Some example JSON inputs for testing
 
41
  input_json = {
42
  "Material": {
43
  "Name": "LLTO",
 
49
  "Dendrite Formation Risk": "",
50
  "Operating Voltage": "",
51
  "Flexibility": "",
52
+ "Processing": "",
53
+ },
54
  },
55
  "Performance": {
56
  "Specific Capacity": {"Value": "", "Unit": ""},
57
  "Energy Density": {"Value": "", "Unit": ""},
58
  "Capacity Retention": "",
59
+ "Operating Temperature": {"Value": "Room temperature", "Unit": ""},
60
  },
61
+ "Usage": {"Battery Type": "", "Benefits": []},
 
 
 
62
  }
63
 
64
+ # Another test
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  input_json = {
66
+ "Experiment": {
67
+ "Material": "LATP powders",
68
+ "SynthesisRoute": "modified sol-gel synthesis route described by (Bucharsky et al., 2015)",
69
+ "Precursors": [
70
+ {
71
+ "Name": "lithium acetate Li(C2H3O2) ⋅2H2O",
72
+ "Purity": "purity ≥ 99 %",
73
+ "Supplier": "Alfa Aesar GmbH & Co KG",
74
+ "Location": "Germany",
75
+ },
76
+ {
77
+ "Name": "aluminum nitrate Al(NO3)3 ⋅9H2O",
78
+ "Purity": "purity ≥ 98.5 %",
79
+ "Supplier": "Merck KGaA",
80
+ "Location": "Germany",
81
+ },
82
+ {
83
+ "Name": "titanium-isopropoxide Ti[OCH(CH3)2]4",
84
+ "Purity": "purity ≥ 98 %",
85
+ "Supplier": "Merck KGaA",
86
+ "Location": "Germany",
87
+ },
88
+ ],
89
+ "Procedure": [
90
+ {
91
+ "Step": "Dissolve lithium acetate and aluminum nitrate in distilled water under constant stirring."
92
+ },
93
+ {"Step": "Add titanium-isopropoxide dropwise to the solution."},
94
+ {"Step": "Add phosphoric acid slowly through a drip funnel to form a gel."},
95
+ {"Step": "Dry the gel at room temperature for 24 h."},
96
+ ],
97
+ "HeatTreatment": [
98
+ {
99
+ "Step": "First, heat treat samples at 400°C for 6 h to achieve precursor formation and eliminate reaction gases."
100
+ },
101
+ {
102
+ "Step": "Second, process samples at 900°C for 8 h to complete the reaction to crystalline LATP."
103
+ },
104
+ ],
105
+ "BatchVariations": [
106
+ {
107
+ "Description": "Prepare one batch with all precursors in stoichiometric quantities (marked as 0.0 wt%)."
108
+ },
109
+ {
110
+ "Description": "Explore different batches with either an excess up to +7.5 wt% or a deficiency up to -15.0 wt% of phosphoric acid compared to the stoichiometric composition."
111
+ },
112
+ ],
113
+ "Processing": [
114
+ {"Step": "Process the obtained powders in a planetary ball mill."},
115
+ {
116
+ "Step": "Form pellets by uniaxial pressing and then further densify by cold isostatic pressing at 400 MPa."
117
+ },
118
+ {
119
+ "Step": "All pressed samples have a green density of approximately 62% relative density."
120
+ },
121
+ ],
122
+ "Sintering": {
123
+ "TemperatureRange": "850 to 1,050°C",
124
+ "IsothermalSinteringTime": "30 to 540 min",
125
+ "Cooling": "Cool down to room temperature in furnace",
126
+ "DensityDetermination": "Determine densities by Archimedes’ method",
127
+ },
128
+ "IonicConductivityMeasurements": {
129
+ "Method": "Impedance analysis",
130
+ "Conditions": "At room temperature over the frequency range from 0.1 Hz to 1 MHz with an AC amplitude of 50 mV in the frequency response analyzer (AMTEK GmbH, VersaSTAT 4, Pennsylvania, United States)",
131
+ "Reference": "For further details of the experimental part please refer to our previous work (Schiffmann et al., 2021)",
132
+ },
 
 
 
 
 
 
 
133
  }
 
134
  }
135
 
136
 
137
  if __name__ == "__main__":
138
+ # Transform the input JSON
139
+ from kadi_apy.lib.conversion import json_to_kadi
140
+
141
+ output_json = json_to_kadi(input_json)
142
 
143
+ # Print the output JSON
144
+ print(json.dumps(output_json, indent=2))