allknowingroger Papajams committed on
Commit
4dc73cd
·
0 Parent(s):

Duplicate from Papajams/substantiator

Browse files

Co-authored-by: Papa Jams <[email protected]>

Files changed (5) hide show
  1. .gitattributes +35 -0
  2. README.md +14 -0
  3. app.py +85 -0
  4. huggingface.yaml +20 -0
  5. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Substantiator
3
+ emoji: 🐠
4
+ colorFrom: red
5
+ colorTo: indigo
6
+ sdk: streamlit
7
+ sdk_version: 1.21.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: wtfpl
11
+ duplicated_from: Papajams/substantiator
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoModel
5
+ import xml.etree.ElementTree as ET
6
+
7
# Load SciBERT pre-trained model and tokenizer.
model_name = "allenai/scibert_scivocab_uncased"


@st.cache_resource
def _load_scibert(name):
    """Return the ``(tokenizer, model)`` pair for the checkpoint ``name``.

    Streamlit re-executes this script from the top on every user
    interaction; ``st.cache_resource`` keeps the loaded objects alive across
    those reruns so the checkpoint is only loaded once per server process.
    """
    return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)


tokenizer, model = _load_scibert(model_name)
11
+
12
def calculate_similarity(claim, document):
    """Return the cosine similarity between a claim and a document.

    Each text is encoded on its own with the module-level ``tokenizer`` and
    ``model``; the model's ``pooler_output`` is used as the text embedding
    and the two embeddings are compared with ``torch.cosine_similarity``.

    Args:
        claim: The claim text entered by the user.
        document: The text to compare against (e.g. a paper abstract).

    Returns:
        The similarity as a Python float, or ``0.0`` when either input is
        empty or ``None``.
    """
    if not claim or not document:
        return 0.0

    def _embed(text):
        # Encode one text by itself. The previous implementation encoded the
        # claim *together with* the document as a sentence pair, so the
        # "claim" embedding already contained the document.
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            return model(**inputs)['pooler_output']

    claim_embeddings = _embed(claim)
    document_embeddings = _embed(document)

    # Compute cosine similarity between embeddings
    return torch.cosine_similarity(claim_embeddings, document_embeddings).item()
31
+
32
def search_arxiv(query, max_results=3):
    """Search the arXiv export API and return matching papers.

    Args:
        query: Free-text search string; it is searched in all fields
            (``all:<query>``).
        max_results: Maximum number of entries requested from the API.

    Returns:
        A list of dicts with the keys ``"title"``, ``"abstract"``,
        ``"link"`` (the PDF URL) and ``"authors"`` (a list of names).
        Returns an empty list for a blank query, and ``None`` when the
        request fails, the status is not 200, or the response is not
        parseable XML.
    """
    if not query or not query.strip():
        # Nothing to search for; avoid sending a malformed "all:" query.
        return []

    atom = "{http://www.w3.org/2005/Atom}"
    # Let requests build the query string so spaces, '&', '#', ... in the
    # user's text are URL-encoded instead of corrupting the request.
    params = {
        "search_query": f"all:{query}",
        "start": 0,
        "max_results": max_results,
        "sortBy": "relevance",
        "sortOrder": "descending",
    }

    try:
        response = requests.get(
            "http://export.arxiv.org/api/query", params=params, timeout=15
        )
        if response.status_code != 200:
            return None
        # Parse the XML (Atom) response
        root = ET.fromstring(response.content)
    except (requests.RequestException, ET.ParseError):
        return None

    search_results = []
    for entry in root.findall(f"{atom}entry"):
        pdf_link = entry.find(f"{atom}link[@title='pdf']")
        if pdf_link is None:
            # Entries without a PDF link cannot be rendered by the caller;
            # skip them instead of discarding every result.
            continue

        search_results.append({
            "title": entry.findtext(f"{atom}title", default=""),
            "abstract": entry.findtext(f"{atom}summary", default=""),
            "link": pdf_link.attrib.get("href", ""),
            "authors": [
                author.findtext(f"{atom}name", default="")
                for author in entry.findall(f"{atom}author")
            ],
        })

    return search_results
63
+
64
def search_papers(user_input):
    """Find papers related to ``user_input``.

    This is the single place where the search backend is chosen; at the
    moment it simply forwards to ``search_arxiv`` and returns whatever that
    function returns.
    """
    return search_arxiv(user_input)
68
+
69
# --- Streamlit UI -----------------------------------------------------------
st.title('The Substantiator')

user_input = st.text_input('Input your claim')

if st.button('Substantiate'):
    # Keep the spinner visible for the whole operation: the network search
    # as well as the per-paper similarity computation.
    with st.spinner('Searching for relevant research papers...'):
        search_results = search_papers(user_input)

        # ``search_papers`` may return None (failure) or an empty list.
        if search_results:
            for result in search_results[:3]:
                # Titles come from a remote feed and may contain line breaks
                # or brackets; normalise them so they form a valid Markdown
                # link label. A plain Markdown link is used instead of raw
                # HTML so remote text is never interpolated into markup
                # rendered with unsafe_allow_html=True.
                title = " ".join((result["title"] or "").split())
                title = title.replace("[", "\\[").replace("]", "\\]")
                st.write(f"[{title}]({result['link']})")

                st.write(result["abstract"])
                st.write(
                    "Authors: ",
                    ", ".join(name for name in result["authors"] if name),
                )
                similarity = calculate_similarity(user_input, result["abstract"])
                st.write("Similarity Score: ", similarity)
                st.write("-----")
        else:
            st.write("No results found.")
huggingface.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title: Substantiator
2
+ description: A Streamlit app that uses SciBERT for semantic similarity.
3
+ tags:
4
+ - Scientific
5
+ - NLP
6
+ pipeline:
7
+ - name: Streamlit
8
+ inputs:
9
+ - name: user_input
10
+ type: text
11
+ description: Input your claim
12
+ outputs:
13
+ - name: substantiation
14
+ type: text
15
+ description: Substantiation of the claim
16
+ resources:
17
+ limits:
18
+ cpu: 1
19
+ memory: 2Gi
20
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit
2
+ requests
3
+ torch
4
+ transformers
5
+ beautifulsoup4