Spaces:

Kadi-IAM
/

KadiTextract

Sleeping

App Files Files Community

Kadi-IAM committed on Oct 28, 2024

Commit

26d01d9

verified ·

1 Parent(s): 87721bf

Upload 5 files

Browse files

Files changed (3) hide show

README.md +4 -0
app.py +31 -19
json2kadi.py +90 -153

README.md CHANGED Viewed

@@ -10,3 +10,7 @@ pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Demo: https://huggingface.co/spaces/Kadi-IAM/KadiTextract
+A simple web app to obtain structured output from text input using Large Language Models (LLMs).

app.py CHANGED Viewed

@@ -1,14 +1,20 @@
 import os
 import json
 import gradio as gr
 import groq
 from difflib import Differ
-from json2kadi import transform_value, my_json_to_kadi
 from kadi_apy.lib.conversion import json_to_kadi
 # Set api key of Groq
 api_key = os.getenv("GROQ_API")
 example_1 = (
     """John B. Goodenough (1922–2023) was a renowned American physicist and materials scientist,
 best known for his pioneering work in developing the lithium-ion battery. He earned a Ph.D. in physics from
@@ -141,7 +147,11 @@ example_4 = (
 """,
 )
 def generate_response(prompt):
     if not prompt:
         return "No transcription available. Please try speaking again."
@@ -162,6 +172,8 @@ def generate_response(prompt):
 def post_process_output(output):
     # 1. remove json mark
     output = output.replace("```", "")
     output = output.replace("null", '""')
@@ -172,6 +184,7 @@ def post_process_output(output):
     return output
 extract_info_prompt = """
 You are an data scientist, extract information from text with given template in json format. Do not add any explanation.
@@ -184,6 +197,8 @@ Template:
 def extract_info(input_text, structure_template):
     # validate structure_template is json
     try:
         structure_template = json.dumps(json.loads(structure_template), indent=4)
@@ -202,11 +217,13 @@ def extract_info(input_text, structure_template):
         except Exception as e:
             print("Error in json format, retrying...")
             continue
     return structured_output
 def diff_texts(text1, text2):
     d = Differ()
     return [
         (token[2:], token[0] if token[0] != " " else None)
@@ -215,6 +232,8 @@ def diff_texts(text1, text2):
 def transform_json_to_kadi_schema(input_json_str):
     input_json = json.loads(input_json_str)
     try:
         output_json = my_json_to_kadi(input_json)
@@ -225,6 +244,7 @@ def transform_json_to_kadi_schema(input_json_str):
     return json.dumps(output_json, indent=2)
 example_structure_template = """
 {
     "Material": {
@@ -252,7 +272,10 @@ example_structure_template = """
 """
 def suggest_template(input_text):
     if not input_text.strip():
         raise gr.Error("The input text should not be empty.")
     combined_prompt = f"""
@@ -282,6 +305,7 @@ def suggest_template(input_text):
     return output
 with gr.Blocks() as demo:
     gr.Markdown(
         "### A simple web app to obtain structured output from text input using Large Language Models (LLMs)."
@@ -305,16 +329,6 @@ with gr.Blocks() as demo:
                     placeholder="Enter your structure template here.",
                 )
-                # with gr.Accordion("Show detailed writing instruction", open=False):
-                #     gr.Markdown(
-                #         "Note: modify **[topic]** in writing instruction accordingly."
-                #     )
-                #     prompt_input = gr.Textbox(
-                #         label="Writing instruction",
-                #         value="I am writing a paper on [topic] for a leading academic journal and would like help refining a specific section. Please rephrase the section to enhance clarity, coherence, and conciseness, ensuring smooth transitions between paragraphs and logical flow. Remove any unnecessary jargon and maintain a formal, professional tone suitable for an academic audience.",
-                #         lines=5,
-                #     )
                 with gr.Row():
                     suggest_btn = gr.Button("Suggest template", scale=1)
                     submit_btn = gr.Button("Extract", variant="primary", scale=2)
@@ -322,12 +336,6 @@ with gr.Blocks() as demo:
             with gr.Column():
                 output = gr.Textbox(label="Structured Output", show_copy_button=True)
                 with gr.Accordion("Show Kadi-compatible output", open=False):
-                    # output_diff = gr.HighlightedText(
-                    #     label="Diff",
-                    #     combine_adjacent=True,
-                    #     show_legend=True,
-                    #     color_map={"-": "red", "+": "green"},
-                    # )
                     output_kadi = gr.Textbox(
                         label="Kadi compatible metadata output",
                         lines=5,
@@ -335,9 +343,12 @@ with gr.Blocks() as demo:
                     )
                     gr.Markdown()
-                    gr.Markdown("Add metadata by copying and pasting in [Kadi](https://kadi.iam.kit.edu/) Record")
                     gr.Markdown("![](file/copy_to_kadi.png)")
         submit_btn.click(
             fn=extract_info, inputs=[text_input, structure_template], outputs=output
         )
@@ -349,6 +360,7 @@ with gr.Blocks() as demo:
             fn=transform_json_to_kadi_schema, inputs=[output], outputs=output_kadi
         )
         gr.Markdown()
         gr.Markdown()
         gr.Markdown()

+"""
+This application demo shows how to extract structured information using LLMs
+and transfer it as metadata in Kadi.
+"""
 import os
 import json
 import gradio as gr
 import groq
 from difflib import Differ
+from json2kadi import my_json_to_kadi
 from kadi_apy.lib.conversion import json_to_kadi
 # Set api key of Groq
 api_key = os.getenv("GROQ_API")
+# Examples
 example_1 = (
     """John B. Goodenough (1922–2023) was a renowned American physicist and materials scientist,
 best known for his pioneering work in developing the lithium-ion battery. He earned a Ph.D. in physics from
 """,
 )
 def generate_response(prompt):
+    """
+    Get response (structured json) from LLMs.
+    """
     if not prompt:
         return "No transcription available. Please try speaking again."
 def post_process_output(output):
+    """Clean up output."""
     # 1. remove json mark
     output = output.replace("```", "")
     output = output.replace("null", '""')
     return output
+# Basic prompt for extraction
 extract_info_prompt = """
 You are an data scientist, extract information from text with given template in json format. Do not add any explanation.
 def extract_info(input_text, structure_template):
+    """Extract structured output from text input."""
     # validate structure_template is json
     try:
         structure_template = json.dumps(json.loads(structure_template), indent=4)
         except Exception as e:
             print("Error in json format, retrying...")
             continue
     return structured_output
 def diff_texts(text1, text2):
+    """Compare two text inputs."""
     d = Differ()
     return [
         (token[2:], token[0] if token[0] != " " else None)
 def transform_json_to_kadi_schema(input_json_str):
+    """Transform json into Kadi metadata schema."""
     input_json = json.loads(input_json_str)
     try:
         output_json = my_json_to_kadi(input_json)
     return json.dumps(output_json, indent=2)
+# Basic template for inferring json template
 example_structure_template = """
 {
     "Material": {
 """
+# Infer template from text input based on example template defined above
 def suggest_template(input_text):
+    """Infer structured template from text input."""
     if not input_text.strip():
         raise gr.Error("The input text should not be empty.")
     combined_prompt = f"""
     return output
+# Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown(
         "### A simple web app to obtain structured output from text input using Large Language Models (LLMs)."
                     placeholder="Enter your structure template here.",
                 )
                 with gr.Row():
                     suggest_btn = gr.Button("Suggest template", scale=1)
                     submit_btn = gr.Button("Extract", variant="primary", scale=2)
             with gr.Column():
                 output = gr.Textbox(label="Structured Output", show_copy_button=True)
                 with gr.Accordion("Show Kadi-compatible output", open=False):
                     output_kadi = gr.Textbox(
                         label="Kadi compatible metadata output",
                         lines=5,
                     )
                     gr.Markdown()
+                    gr.Markdown(
+                        "Add metadata by copying and pasting in [Kadi](https://kadi.iam.kit.edu/) Record"
+                    )
                     gr.Markdown("![](file/copy_to_kadi.png)")
+        # Actions
         submit_btn.click(
             fn=extract_info, inputs=[text_input, structure_template], outputs=output
         )
             fn=transform_json_to_kadi_schema, inputs=[output], outputs=output_kadi
         )
+        # Placeholder
         gr.Markdown()
         gr.Markdown()
         gr.Markdown()

json2kadi.py CHANGED Viewed

@@ -1,46 +1,43 @@
 import json
 def transform_value(key, value):
     if isinstance(value, dict):
-        if 'Value' in value and 'Unit' in value:
-            value_type = "str" if isinstance(value['Value'], str) else "float"
             return {
                 "key": key,
                 "type": "dict",
                 "value": [
-                    {"key": "Value", "type": value_type, "value": value['Value']},
-                    {"key": "Unit", "type": "str", "value": value['Unit']}
-                ]
             }
         else:
             return {
                 "key": key,
                 "type": "dict",
-                "value": [transform_value(k, v) for k, v in value.items()]
             }
     elif isinstance(value, list):
         return {
             "key": key,
             "type": "list",
-            "value": [transform_value("", item) for item in value]
         }
     elif isinstance(value, str):
-        return {
-            "key": key,
-            "type": "str",
-            "value": value
-        }
     else:
         raise ValueError(f"Unsupported value type: {type(value)}")
 def my_json_to_kadi(data):
     return [transform_value(key, value) for key, value in data.items()]
 # Print the output JSON in a formatted way
-# Example JSON input
 input_json = {
     "Material": {
         "Name": "LLTO",
@@ -52,156 +49,96 @@ input_json = {
             "Dendrite Formation Risk": "",
             "Operating Voltage": "",
             "Flexibility": "",
-            "Processing": ""
-        }
     },
     "Performance": {
         "Specific Capacity": {"Value": "", "Unit": ""},
         "Energy Density": {"Value": "", "Unit": ""},
         "Capacity Retention": "",
-        "Operating Temperature": {"Value": "Room temperature", "Unit": ""}
     },
-    "Usage": {
-        "Battery Type": "",
-        "Benefits": []
-    }
 }
-# input_json = {
-#     "Doctor_Patient_Discussion": {
-#         "Initial_Observation": {
-#             "Symptoms": [
-#                 "pale",
-#                 "sore throat",
-#                 "running a temperature"
-#             ],
-#             "Initial_Assessment": "You\u2019ve moderate fever."
-#         },
-#         "Medical_Examination": {
-#             "Temperature": "99.8",
-#             "Blood_Pressure": "fine",
-#             "Doctor_Assessment": "few symptoms of malaria",
-#             "Diagnosis": "few symptoms of malaria"
-#         },
-#         "Treatment_Plan": {
-#             "Prescription": [
-#                 "three medicines",
-#                 "a syrup"
-#             ]
-#         }
-#     }
-# }
-# input_json = {
-#     "Doctor_Patient_Discussion": {
-#         "Initial_Observation": {
-#             "Symptoms": [
-#                 "pale",
-#                 "sore throat",
-#                 "running a temperature"
-#             ],
-#             "Initial_Assessment": "You\u2019ve moderate fever."
-#         },
-#         "Medical_Examination": {
-#             "Temperature": "99.8",
-#             "Blood_Pressure": "fine",
-#             "Doctor_Assessment": "few symptoms of malaria",
-#             "Diagnosis": "few symptoms of malaria"
-#         },
-#         "Treatment_Plan": {
-#             "Prescription": [
-#                 "three medicines",
-#                 "a syrup"
-#             ]
-#         }
-#     }
-# }
 input_json = {
-  "Experiment": {
-    "Material": "LATP powders",
-    "SynthesisRoute": "modified sol-gel synthesis route described by (Bucharsky et al., 2015)",
-    "Precursors": [
-      {
-        "Name": "lithium acetate Li(C2H3O2) ⋅2H2O",
-        "Purity": "purity ≥ 99 %",
-        "Supplier": "Alfa Aesar GmbH & Co KG",
-        "Location": "Germany"
-      },
-      {
-        "Name": "aluminum nitrate Al(NO3)3 ⋅9H2O",
-        "Purity": "purity ≥ 98.5 %",
-        "Supplier": "Merck KGaA",
-        "Location": "Germany"
-      },
-      {
-        "Name": "titanium-isopropoxide Ti[OCH(CH3)2]4",
-        "Purity": "purity ≥ 98 %",
-        "Supplier": "Merck KGaA",
-        "Location": "Germany"
-      }
-    ],
-    "Procedure": [
-      {
-        "Step": "Dissolve lithium acetate and aluminum nitrate in distilled water under constant stirring."
-      },
-      {
-        "Step": "Add titanium-isopropoxide dropwise to the solution."
-      },
-      {
-        "Step": "Add phosphoric acid slowly through a drip funnel to form a gel."
-      },
-      {
-        "Step": "Dry the gel at room temperature for 24 h."
-      }
-    ],
-    "HeatTreatment": [
-      {
-        "Step": "First, heat treat samples at 400°C for 6 h to achieve precursor formation and eliminate reaction gases."
-      },
-      {
-        "Step": "Second, process samples at 900°C for 8 h to complete the reaction to crystalline LATP."
-      }
-    ],
-    "BatchVariations": [
-      {
-        "Description": "Prepare one batch with all precursors in stoichiometric quantities (marked as 0.0 wt%)."
-      },
-      {
-        "Description": "Explore different batches with either an excess up to +7.5 wt% or a deficiency up to -15.0 wt% of phosphoric acid compared to the stoichiometric composition."
-      }
-    ],
-    "Processing": [
-      {
-        "Step": "Process the obtained powders in a planetary ball mill."
-      },
-      {
-        "Step": "Form pellets by uniaxial pressing and then further densify by cold isostatic pressing at 400 MPa."
-      },
-      {
-        "Step": "All pressed samples have a green density of approximately 62% relative density."
-      }
-    ],
-    "Sintering": {
-      "TemperatureRange": "850 to 1,050°C",
-      "IsothermalSinteringTime": "30 to 540 min",
-      "Cooling": "Cool down to room temperature in furnace",
-      "DensityDetermination": "Determine densities by Archimedes’ method"
-    },
-    "IonicConductivityMeasurements": {
-      "Method": "Impedance analysis",
-      "Conditions": "At room temperature over the frequency range from 0.1 Hz to 1 MHz with an AC amplitude of 50 mV in the frequency response analyzer (AMTEK GmbH, VersaSTAT 4, Pennsylvania, United States)",
-      "Reference": "For further details of the experimental part please refer to our previous work (Schiffmann et al., 2021)"
     }
-  }
 }
 if __name__ == "__main__":
-  # Transform the input JSON
-  # output_json = transform_json2kadi(input_json)
-  from kadi_apy.lib.conversion import json_to_kadi
-  output_json = json_to_kadi(input_json)
-  # Print the output JSON
-  print(json.dumps(output_json, indent=2))

 import json
+# Transform value in metadata
 def transform_value(key, value):
     if isinstance(value, dict):
+        if "Value" in value and "Unit" in value:
+            value_type = "str" if isinstance(value["Value"], str) else "float"
             return {
                 "key": key,
                 "type": "dict",
                 "value": [
+                    {"key": "Value", "type": value_type, "value": value["Value"]},
+                    {"key": "Unit", "type": "str", "value": value["Unit"]},
+                ],
             }
         else:
             return {
                 "key": key,
                 "type": "dict",
+                "value": [transform_value(k, v) for k, v in value.items()],
             }
     elif isinstance(value, list):
         return {
             "key": key,
             "type": "list",
+            "value": [transform_value("", item) for item in value],
         }
     elif isinstance(value, str):
+        return {"key": key, "type": "str", "value": value}
     else:
         raise ValueError(f"Unsupported value type: {type(value)}")
 def my_json_to_kadi(data):
     return [transform_value(key, value) for key, value in data.items()]
 # Print the output JSON in a formatted way
+# Some example JSON inputs for testing
 input_json = {
     "Material": {
         "Name": "LLTO",
             "Dendrite Formation Risk": "",
             "Operating Voltage": "",
             "Flexibility": "",
+            "Processing": "",
+        },
     },
     "Performance": {
         "Specific Capacity": {"Value": "", "Unit": ""},
         "Energy Density": {"Value": "", "Unit": ""},
         "Capacity Retention": "",
+        "Operating Temperature": {"Value": "Room temperature", "Unit": ""},
     },
+    "Usage": {"Battery Type": "", "Benefits": []},
 }
+# Another test
 input_json = {
+    "Experiment": {
+        "Material": "LATP powders",
+        "SynthesisRoute": "modified sol-gel synthesis route described by (Bucharsky et al., 2015)",
+        "Precursors": [
+            {
+                "Name": "lithium acetate Li(C2H3O2) ⋅2H2O",
+                "Purity": "purity ≥ 99 %",
+                "Supplier": "Alfa Aesar GmbH & Co KG",
+                "Location": "Germany",
+            },
+            {
+                "Name": "aluminum nitrate Al(NO3)3 ⋅9H2O",
+                "Purity": "purity ≥ 98.5 %",
+                "Supplier": "Merck KGaA",
+                "Location": "Germany",
+            },
+            {
+                "Name": "titanium-isopropoxide Ti[OCH(CH3)2]4",
+                "Purity": "purity ≥ 98 %",
+                "Supplier": "Merck KGaA",
+                "Location": "Germany",
+            },
+        ],
+        "Procedure": [
+            {
+                "Step": "Dissolve lithium acetate and aluminum nitrate in distilled water under constant stirring."
+            },
+            {"Step": "Add titanium-isopropoxide dropwise to the solution."},
+            {"Step": "Add phosphoric acid slowly through a drip funnel to form a gel."},
+            {"Step": "Dry the gel at room temperature for 24 h."},
+        ],
+        "HeatTreatment": [
+            {
+                "Step": "First, heat treat samples at 400°C for 6 h to achieve precursor formation and eliminate reaction gases."
+            },
+            {
+                "Step": "Second, process samples at 900°C for 8 h to complete the reaction to crystalline LATP."
+            },
+        ],
+        "BatchVariations": [
+            {
+                "Description": "Prepare one batch with all precursors in stoichiometric quantities (marked as 0.0 wt%)."
+            },
+            {
+                "Description": "Explore different batches with either an excess up to +7.5 wt% or a deficiency up to -15.0 wt% of phosphoric acid compared to the stoichiometric composition."
+            },
+        ],
+        "Processing": [
+            {"Step": "Process the obtained powders in a planetary ball mill."},
+            {
+                "Step": "Form pellets by uniaxial pressing and then further densify by cold isostatic pressing at 400 MPa."
+            },
+            {
+                "Step": "All pressed samples have a green density of approximately 62% relative density."
+            },
+        ],
+        "Sintering": {
+            "TemperatureRange": "850 to 1,050°C",
+            "IsothermalSinteringTime": "30 to 540 min",
+            "Cooling": "Cool down to room temperature in furnace",
+            "DensityDetermination": "Determine densities by Archimedes’ method",
+        },
+        "IonicConductivityMeasurements": {
+            "Method": "Impedance analysis",
+            "Conditions": "At room temperature over the frequency range from 0.1 Hz to 1 MHz with an AC amplitude of 50 mV in the frequency response analyzer (AMTEK GmbH, VersaSTAT 4, Pennsylvania, United States)",
+            "Reference": "For further details of the experimental part please refer to our previous work (Schiffmann et al., 2021)",
+        },
     }
 }
 if __name__ == "__main__":
+    # Transform the input JSON
+    from kadi_apy.lib.conversion import json_to_kadi
+    output_json = json_to_kadi(input_json)
+    # Print the output JSON
+    print(json.dumps(output_json, indent=2))