Spaces:
Sleeping
Sleeping
File size: 7,351 Bytes
6ab9084 0723a2c 6ab9084 4133681 0723a2c 4133681 0723a2c 6ab9084 4133681 0723a2c 4133681 0723a2c 4133681 0723a2c 4133681 0723a2c 4133681 0723a2c 4133681 0723a2c 4133681 0723a2c d79d1b9 0723a2c 4133681 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import pysbd
from txtai.embeddings import Embeddings
import networkx as nx
from tqdm import tqdm
from txtai.graph import GraphFactory
from datasets import load_dataset
import streamlit as st
import streamlit.components.v1 as components
import string
# --- Page chrome and sidebar settings form -------------------------------
# All widgets below run on every Streamlit rerun; their return values are
# module-level globals consumed by the functions further down.
st.set_page_config(page_title="DebateKG")
st.title("DebateKG - Automatic Policy Debate Case Creation")
st.caption("github: https://github.com/Hellisotherpeople/DebateKG")
form = st.sidebar.form("Main Settings")
form.header("Main Settings")
# Minimum token relevance score for yellow highlighting in highlight().
highlight_threshold = form.number_input("Enter the minimum similarity value needed to highlight" , value = 0.05)
# Which dataset columns to render for each piece of evidence in the case.
show_extract = form.checkbox("Show extracts", value = True)
show_abstract = form.checkbox("Show abstract", value = False)
show_full_doc = form.checkbox("Show full doc", value = False)
show_citation = form.checkbox("Show citation", value = True)
# If non-empty, the graph is filtered to nodes containing this word before
# path-finding (see the paths_form handler at the bottom of the file).
rerank_word = form.text_input("(Optional) Constrain all evidence in the case to have this word within its text", value = "")
form.caption("Doing this may create graphs which are so constrained that DebateKG can't find a valid path in the graph to build a case")
# Dimensions of the rendered HTML component holding the generated case.
html_window_width = form.number_input("Enter the pixel width of the output debate case window", value = 1000)
html_window_height = form.number_input("Enter the pixel height of the output debate case window", value = 1000)
# Pre-built semantic-graph archives; `option` is the file load_embeddings()
# loads. Default (index=2) is the mpnet abstract graph.
option = form.selectbox(
'Which Knowledge Graph do you want to use?',
('DebateSum_SemanticGraph_longformer_extract.tar.gz', 'DebateSum_SemanticGraph_longformer_abstract.tar.gz', 'DebateSum_SemanticGraph_mpnet_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_abstract.tar.gz', 'DebateSum_SemanticGraph_legalbert_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_extract.tar.gz', 'DebateSum_SemanticGraph_mpnet_sentence.tar.gz'), index = 2)
form.form_submit_button("Change Settings")
@st.cache(allow_output_mutation=True)
def load_my_dataset():
    """Fetch the DebateSum training split, cached across Streamlit reruns."""
    return load_dataset("Hellisotherpeople/DebateSum", split = "train")
@st.cache(allow_output_mutation=True)
def load_embeddings():
    """Build a txtai Embeddings index and load the selected graph archive.

    Reads the module-level ``option`` (sidebar selectbox) to decide which
    pre-built semantic-graph tarball to load. Cached across reruns.
    """
    config = {
        "path": "sentence-transformers/all-mpnet-base-v2",
        "content": True,
        # Expose graph node attributes as a SQL-callable function.
        "functions": [
            {"name": "graph", "function": "graph.attribute"},
        ],
        # Virtual columns resolved through the semantic graph.
        "expressions": [
            {"name": "topic", "expression": "graph(indexid, 'topic')"},
            {"name": "topicrank", "expression": "graph(indexid, 'topicrank')"},
        ],
        "graph": {
            "limit": 100,
            "minscore": 0.10,
            "topics": {
                "terms": 4,
                "resolution": 100,
            },
        },
    }
    index = Embeddings(config)
    index.load(option)
    return index
# Materialize the cached resources once per session. `graph` is the txtai
# semantic graph object attached to the loaded embeddings index; its
# `.backend` (a networkx graph) is used for path-finding below.
dataset = load_my_dataset()
embeddings = load_embeddings()
graph = embeddings.graph
def david_distance(source, target, attrs):
    """Edge-weight callback for networkx shortest-path search.

    Converts a similarity ``weight`` edge attribute into a traversal cost
    of ``1 - weight`` (floored at 0). Costs below 0.15 — i.e. nearly
    identical nodes — are penalized to the maximum cost of 1.0 so that
    paths prefer moderately related hops over near-duplicates.
    """
    cost = 1.0 - attrs["weight"]
    if cost < 0.0:
        cost = 0.0
    if cost < 0.15:
        return 1.00
    return cost
def david_showpath(source, target, the_graph):
    """Return the single shortest path between two node ids, using
    david_distance() as the edge-weight function."""
    return nx.shortest_path(the_graph, source=source, target=target, weight=david_distance)
def david_show_all_paths(source, target, the_graph):
    """Return a generator over all equally-short paths between two node
    ids, using david_distance() as the edge-weight function."""
    return nx.all_shortest_paths(the_graph, source=source, target=target, weight=david_distance)
def highlight(index, result, threshold=None):
    """Render a token-scored txtai explain() result as an HTML fragment.

    Tokens whose relevance score exceeds the threshold are wrapped in a
    yellow ``<span>``; the rest are emitted as plain text. Each token is
    followed by a single space.

    Args:
        index: ordinal shown as the line prefix (rendered as ``"{index}. "``).
        result: mapping with a ``"tokens"`` key holding (token, score) pairs.
        threshold: minimum score required to highlight a token. Defaults to
            the module-level ``highlight_threshold`` sidebar setting, so
            existing callers are unaffected.

    Returns:
        The assembled HTML string.
    """
    if threshold is None:
        # Fall back to the sidebar setting (module global) as before.
        threshold = highlight_threshold
    output = f"{index}. "
    for token, score in result["tokens"]:
        if score > threshold:
            output += f"<span style='background-color: #fff59d'>{token}</span> "
        else:
            output += f"{token} "
    return output
def showpath_any(list_of_arguments, strip_punctuation = True, the_graph=graph.backend):
    """Build a debate case linking the given argument node ids and render it.

    For each consecutive pair of indexids, finds the shortest semantic path
    through the graph, looks up each node's dataset row, and emits the
    chosen evidence fields (abstract/citation/extract/full document, per the
    sidebar checkboxes) plus a token-highlighted "explain" transition between
    neighboring path elements. The result is rendered as a Streamlit HTML
    component.

    NOTE(review): the ``the_graph`` default is bound to ``graph.backend``
    once, at function-definition time — callers that need a filtered graph
    must pass it explicitly (as the rerank_word branch below does).
    """
    # Chain shortest paths between each consecutive pair of requested ids.
    list_of_paths = []
    for x, y in zip(list_of_arguments, list_of_arguments[1:]):
        a_path = david_showpath(x, y, the_graph)
        list_of_paths.extend(a_path)
    #print(list_of_paths)
    path = [graph.attribute(p, "text") for p in list_of_paths]
    # Map each node's text back to a row id via a similarity search.
    # NOTE(review): the text is interpolated directly into the SQL string;
    # with strip_punctuation=False an apostrophe in the text would break the
    # query — confirm callers always strip, or escape quotes here.
    list_of_evidence_ids = []
    for text in path:
        if strip_punctuation:
            text = text.translate(str.maketrans("","", string.punctuation))
        list_of_evidence_ids.append(int(embeddings.search(f"select id from txtai where similar('{text}') limit 1")[0]['id']))
    sections = []
    #sections.append(list_of_evidence_ids)
    for x, p in enumerate(path):
        if x == 0:
            # Print start node
            sections.append(f"{x + 1}. {p}")
            if show_abstract:
                sections.append(dataset["Abstract"][list_of_evidence_ids[x]])
            if show_citation:
                sections.append(dataset["Citation"][list_of_evidence_ids[x]])
            if show_extract:
                sections.append(dataset["Extract"][list_of_evidence_ids[x]])
            if show_full_doc:
                sections.append(dataset["Full-Document"][list_of_evidence_ids[x]])
        if x < len(path) - 1:
            # Explain and highlight next path element
            results = embeddings.explain(p, [path[x + 1]], limit=1)[0]
            sections.append(highlight(x + 2, results))
            if show_abstract:
                sections.append(dataset["Abstract"][list_of_evidence_ids[x+1]])
            if show_citation:
                sections.append(dataset["Citation"][list_of_evidence_ids[x+1]])
            if show_extract:
                sections.append(dataset["Extract"][list_of_evidence_ids[x+1]])
            if show_full_doc:
                sections.append(dataset["Full-Document"][list_of_evidence_ids[x+1]])
    # Render all sections as one scrollable HTML component.
    return components.html("<br/><br/>".join(sections), scrolling = True, width = html_window_width, height = html_window_height)
def question(text, rerank_word = "", rerank_topic = "", limit = 100):
    """Run a semantic SQL search for arguments similar to ``text``,
    optionally constrained to rows whose text/topic contain a substring."""
    query = (
        f"select id, text, topic, evidence_id, score from txtai "
        f"where similar('{text}') and text like '%{rerank_word}%' "
        f"and topic like '%{rerank_topic}%' limit {limit}"
    )
    return embeddings.search(query)
# --- Step 1: argument discovery form -------------------------------------
# Lets the user run arbitrary txtai semantic SQL to find indexids to link.
query_form = st.form("Query the Index:")
query_form.write("Step 1: Find Arguments")
query_form.write("Use semantic SQL from txtai to find some arguments, we use indexids to keep track of them.")
query_form.caption("You can use the semantic SQL to explore the dataset too! The possibilities are limitless!")
query_sql = query_form.text_area("Enter a semantic SQL statement", value = f"select topic, * from txtai where similar('Trump and US relations with China') and topic like '%trump%' and text like '%china%' limit 1")
query_form_submitted = query_form.form_submit_button("Query")
if query_form_submitted:
    with st.expander("Output (Open Me)", expanded = False):
        #my_path = showpath_any([170750, 50, 23])
        #st.write(embeddings.search(f"select * from txtai where similar('you') and text like '%the%' limit 10"))
        # Execute the user's SQL verbatim and display the raw result rows.
        st.write(embeddings.search(query_sql))
# --- Step 2: case-building form ------------------------------------------
# Takes a whitespace-separated list of indexids and renders a debate case
# linking them via graph shortest paths.
paths_form = st.form("Build the Arguments")
paths_form.write("Step 2: Build a Policy Debate Case")
paths_form.write("Enter any number of indexids (arguments), DebateKG will build a debate case out of it which links them all together")
user_paths_string = paths_form.text_area("Enter a list of indexids seperated by whitespace", value = "250 10000 2405")
# NOTE(review): this parse runs on every rerun, before submission, and
# int() will raise ValueError on non-numeric input — consider validating.
user_paths_list_of_strings = user_paths_string.split()
user_paths_list = list(map(int, user_paths_list_of_strings))
paths_form_submitted = paths_form.form_submit_button("Build a Policy Debate Case")
if paths_form_submitted:
    if rerank_word:
        # Restrict the search graph to nodes containing the rerank word,
        # then path-find within that induced subgraph.
        selected_nodes = [n for n,v in graph.backend.nodes(data=True) if rerank_word in v['text']] ##also works for topic
        H = graph.backend.subgraph(selected_nodes)
        showpath_any(user_paths_list, the_graph = H)
    else:
        showpath_any(user_paths_list)
|