#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Sample application to demo the `TextGraphs` library.
see copyright/license https://huggingface.co/spaces/DerwenAI/textgraphs/blob/main/README.md
"""

import asyncio
import sys  # pylint: disable=W0611
import time
import traceback
import typing

from icecream import ic  # pylint: disable=E0401
from pyinstrument import Profiler  # pylint: disable=E0401
import matplotlib.pyplot as plt  # pylint: disable=E0401
import pandas as pd  # pylint: disable=E0401

import textgraphs


if __name__ == "__main__":
    SRC_TEXT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
After the war, Werner fled to America to become famous.
    """

    ## set up
    ## NB: profiler raises handler exceptions when `concur = False`
    debug: bool = False  # True
    concur: bool = True  # False
    profile: bool = True  # False

    if profile:
        profiler: Profiler = Profiler()
        profiler.start()
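
    ## `pyinstrument` is a statistical (sampling) profiler; when enabled,
    ## its report at the end shows where wall-clock time goes across the
    ## pipeline stages below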

    try:
        start_time: float = time.time()

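        ## configure the pipeline factory: a spaCy model for parsing, an
        ## optional NER component (disabled here), knowledge-graph linking
        ## against DBPedia and Wikidata endpoints, plus two relation-inference
        ## models (OpenNRE and mREBEL)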
        tg: textgraphs.TextGraphs = textgraphs.TextGraphs(
            factory = textgraphs.PipelineFactory(
                spacy_model = textgraphs.SPACY_MODEL,
                ner = None,  # textgraphs.NERSpanMarker(),
                kg = textgraphs.KGWikiMedia(
                    spotlight_api = textgraphs.DBPEDIA_SPOTLIGHT_API,
                    dbpedia_search_api = textgraphs.DBPEDIA_SEARCH_API,
                    dbpedia_sparql_api = textgraphs.DBPEDIA_SPARQL_API,
                    wikidata_api = textgraphs.WIKIDATA_API,
                ),
                infer_rels = [
                    textgraphs.InferRel_OpenNRE(
                        model = textgraphs.OPENNRE_MODEL,
                        max_skip = textgraphs.MAX_SKIP,
                        min_prob = textgraphs.OPENNRE_MIN_PROB,
                    ),
                    textgraphs.InferRel_Rebel(
                        lang = "en_XX",
                        mrebel_model = textgraphs.MREBEL_MODEL,
                    ),
                ],
            ),
        )

        duration: float = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: set up")

        ## NLP parse
        start_time = time.time()

        pipe: textgraphs.Pipeline = tg.create_pipeline(
            SRC_TEXT.strip(),
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: parse text")

        ## collect graph elements from the parse
        start_time = time.time()

        tg.collect_graph_elements(
            pipe,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: collect elements")

        ## perform entity linking
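        ## (this step calls the remote DBPedia Spotlight, DBPedia search,
        ## and Wikidata APIs configured above, so timing depends on the
        ## network)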
        start_time = time.time()

        tg.perform_entity_linking(
            pipe,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: entity linking")

        ## perform concurrent relation extraction
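        ## when `concur` is set, the two relation-inference models run
        ## concurrently on an asyncio event loop: reuse a running loop if
        ## one exists, otherwise create one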
        start_time = time.time()

        if concur:
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

            inferred_edges: list = loop.run_until_complete(
                tg.infer_relations_async(
                    pipe,
                    debug = debug,
                )
            )
        else:
            inferred_edges = tg.infer_relations(
                pipe,
                debug = debug,
            )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: relation extraction")
        n_list: list = list(tg.nodes.values())

        df_rel: pd.DataFrame = pd.DataFrame.from_dict([
            {
                "src": n_list[edge.src_node].text,
                "dst": n_list[edge.dst_node].text,
                "rel": pipe.kg.normalize_prefix(edge.rel),
                "weight": edge.prob,
            }
            for edge in inferred_edges
        ])

        ic(df_rel)

        ## construct the _lemma graph_
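        ## (the lemma graph merges the collected parse elements, linked
        ## entities, and inferred relations into a single graph, which the
        ## ranking and serialization steps below operate on)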
        start_time = time.time()

        tg.construct_lemma_graph(
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: construct graph")

        ## rank the extracted phrases
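        ## (`pr_alpha` gets passed as the damping factor for a
        ## PageRank-style centrality over the lemma graph)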
        start_time = time.time()

        tg.calc_phrase_ranks(
            pr_alpha = textgraphs.PAGERANK_ALPHA,
            debug = debug,
        )

        duration = round(time.time() - start_time, 3)
        print(f"{duration:7.3f} sec: rank phrases")

        ## show the extracted phrase results
        ic(tg.get_phrases_as_df())

        if debug:  # pylint: disable=W0101
            for key, node in tg.nodes.items():
                print(key, node)

            for key, edge in tg.edges.items():
                print(key, edge)

    except Exception as ex:  # pylint: disable=W0718
        ic(ex)
        traceback.print_exc()

    ## transform graph data to a _graph of relations_
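    ## (this transform inverts the perspective so that relations become
    ## nodes, letting pairwise affinity scores be computed between
    ## relation types)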
    start_time = time.time()

    gor: textgraphs.GraphOfRelations = textgraphs.GraphOfRelations(
        tg,
    )

    gor.seeds(
        debug = False,  # True
    )

    gor.construct_gor(
        debug = False,  # True
    )

    _scores: typing.Dict[ tuple, float ] = gor.get_affinity_scores(
        debug = False,  # True
    )

    duration = round(time.time() - start_time, 3)
    print(f"{duration:7.3f} sec: graph of relations")

    gor.render_gor_plt(_scores)
    plt.show()
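
    ## (rendering goes through `matplotlib`; with an interactive backend,
    ## `plt.show()` blocks until the plot window gets closed, then the
    ## script continues to the profiler report and JSON dump)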
    #sys.exit(0)


    ######################################################################
    ## stack profiler report

    if profile:
        profiler.stop()
        profiler.print()

    ## output lemma graph as JSON
    with open("lemma.json", "w", encoding = "utf-8") as fp:
        fp.write(tg.dump_lemma_graph())
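
    ## NB: as a quick sanity check (not part of the original demo) the JSON
    ## file could be reloaded as a graph, assuming `dump_lemma_graph()`
    ## emits node-link data compatible with `networkx`:
    #import json
    #import networkx as nx
    #
    #with open("lemma.json", "r", encoding = "utf-8") as fp:
    #    lemma_graph = nx.node_link_graph(json.load(fp))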