Chenxi Whitehouse
commited on
Commit
·
17af92c
1
Parent(s):
122e10f
add files
Browse files- README.md +36 -0
- script/scraper.sh +9 -0
- src/reranking/bm25_sentenes.py +118 -0
- src/retrieval/html2lines.py +84 -0
- src/retrieval/scraper_for_knowledge_store.py +158 -0
README.md
CHANGED
@@ -1,3 +1,39 @@
|
|
1 |
---
|
2 |
license: apache-2.0
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
license: apache-2.0
|
3 |
---
|
4 |
+
|
5 |
+
# AVeriTeC
|
6 |
+
|
7 |
+
|
8 |
+
Data, knowledge store and source code to reproduce the baseline experiments for the [AVeriTeC](https://arxiv.org/abs/2305.13117) dataset, which will be used for the 7th [FEVER](https://fever.ai/) workshop co-hosted at EMNLP 2024.
|
9 |
+
|
10 |
+
|
11 |
+
### Set up environment
|
12 |
+
|
13 |
+
```
|
14 |
+
conda create -n averitec python=3.11
|
15 |
+
conda activate averitec
|
16 |
+
|
17 |
+
pip install -r requirements.txt
|
18 |
+
python -m spacy download en_core_web_lg
|
19 |
+
python -m nltk.downloader punkt
|
20 |
+
python -m nltk.downloader wordnet
|
21 |
+
conda install pytorch pytorch-cuda=11.8 -c pytorch -c nvidia
|
22 |
+
```
|
23 |
+
|
24 |
+
### Scrape text from the URLs obtained by searching queries with the Google API.
|
25 |
+
|
26 |
+
We provide up to 1000 URLs for each claim returned from a Google API search using different queries. This is a courtesy aimed at reducing the cost of using the Google Search API for participants of the shared task. The URL files can be found [here](https://huggingface.co/chenxwh/AVeriTeC/tree/main/data_store/urls).
|
27 |
+
|
28 |
+
You can use your own scraping tool to extract sentences from the URLs. Alternatively, we have included a scraping tool for this purpose, which can be executed as follows. The processed files are also provided and can be found [here](https://huggingface.co/chenxwh/AVeriTeC/tree/main/data_store/knowledge_store).
|
29 |
+
|
30 |
+
```
|
31 |
+
bash script/scraper.sh <split> <start_idx> <end_idx>
|
32 |
+
# e.g., bash script/scraper.sh dev 0 500
|
33 |
+
```
|
34 |
+
|
35 |
+
### Rank the sentences in the knowledge store with BM25
|
36 |
+
See [bm25_sentenes.py](https://huggingface.co/chenxwh/AVeriTeC/tree/main/src/reranking/bm25_sentenes.py) for the full list of command-line arguments.
|
37 |
+
```
|
38 |
+
python -m src.reranking.bm25_sentenes
|
39 |
+
```
|
script/scraper.sh
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
#
# Scrape the knowledge store for a contiguous range of claims.
#
# Usage: bash script/scraper.sh <split> <start_idx> <end_idx>
#   e.g. bash script/scraper.sh dev 0 500
#
# One background scraper process is started for every claim index in
# [start_idx, end_idx); the script then waits for all of them to finish.

if [ "$#" -ne 3 ]; then
    echo "Usage: bash $0 <split> <start_idx> <end_idx>" >&2
    exit 1
fi

split="$1"
start_idx="$2"
end_idx="$3"

# The scraper saves downloaded PDFs under pdf_dir/ (relative to the
# working directory), so make sure the directory exists up front.
mkdir -p pdf_dir

for ((i = start_idx; i < end_idx; i++))
do
    echo "$i"
    python -m src.retrieval.scraper_for_knowledge_store \
        -i "../AVeriTeC/data_store/${split}_store/${i}.tsv" \
        -o "data_store/output_${split}" &
done

# Block until every background scraper has exited.
wait
|
src/reranking/bm25_sentenes.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import time
|
5 |
+
import numpy as np
|
6 |
+
import nltk
|
7 |
+
from rank_bm25 import BM25Okapi
|
8 |
+
|
9 |
+
|
10 |
+
def combine_all_sentences(knowledge_file):
    """Collect all scraped sentences for one claim together with their URLs.

    The knowledge file is read as JSON Lines: each non-blank line is a JSON
    object with a ``"url"`` entry and a ``"url2text"`` list of sentences.
    Sentences are concatenated in file order and are NOT de-duplicated.

    Args:
        knowledge_file: Path to the JSON Lines file for a single claim.

    Returns:
        A tuple ``(sentences, urls, num_urls)`` where ``urls[j]`` is the URL
        that ``sentences[j]`` came from and ``num_urls`` is the number of
        records read. An empty file yields ``([], [], 0)``.
    """
    sentences, urls = [], []
    num_urls = 0  # explicit counter so an empty file does not leave it unbound

    with open(knowledge_file, "r", encoding="utf-8") as json_file:
        for line in json_file:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            data = json.loads(line)
            url_sentences = data["url2text"]
            sentences.extend(url_sentences)
            # Repeat the URL once per sentence to keep both lists aligned.
            urls.extend([data["url"]] * len(url_sentences))
            num_urls += 1

    return sentences, urls, num_urls
|
20 |
+
|
21 |
+
|
22 |
+
def retrieve_top_k_sentences(query, document, urls, top_k):
    """Rank sentences against a query with BM25 and keep the best ``top_k``.

    Args:
        query: The query text (the claim); tokenized with ``nltk.word_tokenize``.
        document: List of candidate sentences.
        urls: List parallel to ``document`` giving each sentence's source URL.
        top_k: Maximum number of sentences to return.

    Returns:
        A tuple ``(sentences, urls)`` of two parallel lists ordered from the
        highest BM25 score to the lowest. Both are empty when ``document``
        is empty.
    """
    if not document:
        # BM25Okapi cannot be built from an empty corpus (it divides by the
        # corpus size), so there is nothing to rank.
        return [], []

    tokenized_docs = [nltk.word_tokenize(doc) for doc in document]
    bm25 = BM25Okapi(tokenized_docs)
    scores = bm25.get_scores(nltk.word_tokenize(query))
    # argsort is ascending, so reverse it before taking the first top_k.
    top_k_idx = np.argsort(scores)[::-1][:top_k]

    return [document[i] for i in top_k_idx], [urls[i] for i in top_k_idx]
|
29 |
+
|
30 |
+
|
31 |
+
if __name__ == "__main__":
    # Command-line entry point: for every claim whose index lies in
    # [start, end), load its knowledge-store file, rank all sentences with
    # BM25 against the claim text, and append one JSON line holding the
    # top-k sentences (with their URLs) to the output file.

    parser = argparse.ArgumentParser(
        description="Get top 100 sentences for sentences in the knowlede store"
    )
    parser.add_argument(
        "-k",
        "--knowledge_store_dir",
        type=str,
        default="data_store/output_dev",
        help="The path of the knowledge_store_dir containing json files with all the retrieved sentences.",
    )
    parser.add_argument(
        "-c",
        "--claim_file",
        type=str,
        default="data/dev.json",
        help="The path of the file that stores the claim.",
    )
    parser.add_argument(
        "-o",
        "--json_output",
        type=str,
        default="data_store/dev_top_k.json",
        help="The output dir for JSON files to save the top 100 sentences for each claim.",
    )
    parser.add_argument(
        "--top_k",
        default=100,
        type=int,
        help="How many documents should we pick out with BM25.",
    )
    parser.add_argument(
        "-s",
        "--start",
        type=int,
        default=0,
        help="Staring index of the files to process.",
    )
    parser.add_argument(
        "-e", "--end", type=int, default=-1, help="End index of the files to process."
    )

    args = parser.parse_args()

    with open(args.claim_file, "r", encoding="utf-8") as json_file:
        target_examples = json.load(json_file)

    if args.end == -1:
        # Default: process as many claims as there are entries in the store dir.
        args.end = len(os.listdir(args.knowledge_store_dir))
        print(args.end)

    # A range gives O(1) membership tests and needs no materialised list.
    files_to_process = range(args.start, args.end)
    total = len(files_to_process)

    # Make sure the directory of the output file exists before opening it.
    os.makedirs(os.path.dirname(args.json_output) or ".", exist_ok=True)

    with open(args.json_output, "w", encoding="utf-8") as output_json:
        done = 0
        for idx, example in enumerate(target_examples):
            if idx not in files_to_process:
                continue

            print(f"Processing claim {idx}... Progress: {done + 1} / {total}")

            # Load the knowledge store for this example
            document_in_sentences, sentence_urls, num_urls_this_claim = (
                combine_all_sentences(
                    os.path.join(args.knowledge_store_dir, f"{idx}.json")
                )
            )

            print(
                f"Obtained {len(document_in_sentences)} sentenes from {num_urls_this_claim} urls."
            )

            # Retrieve top_k sentences with bm25
            st = time.time()
            top_k_sentences, top_k_urls = retrieve_top_k_sentences(
                example["claim"], document_in_sentences, sentence_urls, args.top_k
            )
            print(f"Top {args.top_k} retrieved. Time elapsed: {time.time() - st}.")

            json_data = {
                "claim_id": idx,
                "claim": example["claim"],
                f"top_{args.top_k}": [
                    {"sentence": sent, "url": url}
                    for sent, url in zip(top_k_sentences, top_k_urls)
                ],
            }
            output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
            done += 1
|
src/retrieval/html2lines.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
from time import sleep
|
3 |
+
import trafilatura
|
4 |
+
from trafilatura.meta import reset_caches
|
5 |
+
from trafilatura.settings import DEFAULT_CONFIG
|
6 |
+
import spacy
|
7 |
+
|
8 |
+
|
9 |
+
# spaCy pipeline loaded once at import time; line_correction() uses it to
# split long lines into sentences.
nlp = spacy.load("en_core_web_lg")
|
10 |
+
|
11 |
+
|
12 |
+
# NOTE(review): this sets a plain attribute on DEFAULT_CONFIG. If trafilatura
# reads the limit through its config-parser API instead, this assignment has
# no effect -- TODO confirm against the trafilatura settings documentation.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
|
13 |
+
# Lines shorter than this many characters are dropped by line_correction().
MIN_CHAR = 50
|
14 |
+
# Only the first MAX_CHAR characters of a long line are passed to spaCy.
MAX_CHAR = 5000
|
15 |
+
|
16 |
+
|
17 |
+
def get_page(url, retries=3, delay=3):
    """Download a page with trafilatura, retrying when the fetch fails.

    Args:
        url: Address of the page to download.
        retries: Maximum number of download attempts.
        delay: Seconds to sleep between two consecutive attempts.

    Returns:
        Whatever ``trafilatura.fetch_url`` returned for the first successful
        attempt, or ``None`` if every attempt failed or raised.
    """
    for attempt in range(retries):
        try:
            # Note from the original author: for a website that is under
            # maintenance, trafilatura respects the retry delay requested by
            # the site and may wait for up to 24 hours.
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best effort: any failure of a single attempt counts as "no page".
            page = None

        if page is not None:
            print("Fetched " + url, file=sys.stderr)
            return page

        # Sleeping after the final attempt would only waste time.
        if attempt < retries - 1:
            sleep(delay)

    return None
|
29 |
+
|
30 |
+
|
31 |
+
def url2lines(url):
    """Fetch ``url`` and return the lines of text extracted from it.

    Returns an empty list when ``get_page`` could not download the page;
    otherwise returns whatever ``html2lines`` produces for the download.
    """
    fetched = get_page(url)
    return [] if fetched is None else html2lines(fetched)
|
39 |
+
|
40 |
+
|
41 |
+
def line_correction(lines, max_size=100):
    """Filter out short lines and split long lines into sentence chunks.

    * Lines shorter than ``MIN_CHAR`` characters are dropped.
    * Lines of at most ``max_size`` characters are kept unchanged.
    * Longer lines are split into sentences with spaCy; consecutive
      sentences are joined with a space until the chunk exceeds
      ``max_size`` characters, at which point the chunk is emitted.

    Args:
        lines: Iterable of text lines.
        max_size: Length above which a line is split into chunks.

    Returns:
        A new list with the retained lines and chunks, in input order.
    """
    out_lines = []
    for line in lines:
        if len(line) < MIN_CHAR:
            continue

        if len(line) <= max_size:
            out_lines.append(line)
            continue

        # We split lines into sentences, but for performance only the first
        # MAX_CHAR characters of the line are analysed; the rest is dropped.
        doc = nlp(line[:MAX_CHAR])
        stack = ""
        for sent in doc.sents:
            if stack:
                stack += " "
            stack += str(sent).strip()
            if len(stack) > max_size:
                out_lines.append(stack)
                stack = ""

        # Keep the leftover chunk only if it satisfies the same MIN_CHAR
        # restriction applied to whole lines above (length >= MIN_CHAR).
        if len(stack) >= MIN_CHAR:
            out_lines.append(stack)

    return out_lines
|
68 |
+
|
69 |
+
|
70 |
+
def html2lines(page):
    """Extract the main text of a downloaded page and split it into lines.

    Args:
        page: The downloaded page content as a string, or ``None``.

    Returns:
        The text returned by ``trafilatura.extract`` split on newlines, or
        an empty list when ``page`` is ``None``/blank or nothing could be
        extracted.
    """
    # Test for None first: calling .strip() on None would raise.
    if page is None or not page.strip():
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    reset_caches()

    if text is None:
        return []

    # We just spit out the entire page, so it needs to be reformatted later.
    return text.split("\n")
|
src/retrieval/scraper_for_knowledge_store.py
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import argparse
|
3 |
+
import csv
|
4 |
+
from time import sleep
|
5 |
+
import time
|
6 |
+
import json
|
7 |
+
import numpy as np
|
8 |
+
import fitz
|
9 |
+
import pandas as pd
|
10 |
+
import requests
|
11 |
+
from src.retrieval.html2lines import url2lines, line_correction
|
12 |
+
|
13 |
+
# Raise the csv module's per-field size limit for the csv.reader fallback below.
csv.field_size_limit(100000000)
|
14 |
+
|
15 |
+
# Number of times scrape_text_from_url() attempts its requests.get call.
MAX_RETRIES = 3
|
16 |
+
TIMEOUT = 5 # timeout value passed to requests.get for each attempt
|
17 |
+
|
18 |
+
|
19 |
+
def scrape_text_from_url(url, temp_name):
    """Download ``url`` and return its text as a list of cleaned lines.

    The URL is first requested with ``requests`` (up to ``MAX_RETRIES``
    attempts). URLs ending in ``.pdf`` are saved to
    ``pdf_dir/<temp_name>.pdf`` and their text is extracted with PyMuPDF;
    every other URL is handed to ``url2lines``. The resulting lines are
    passed through ``line_correction``.

    Args:
        url: Address to scrape.
        temp_name: Base name of the temporary PDF file inside ``pdf_dir``.

    Returns:
        A list of text lines; empty when no response was obtained or the
        server answered with HTTP 503.
    """
    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            break  # success: do not request the same URL again
        except requests.RequestException:
            if attempt < MAX_RETRIES - 1:
                sleep(3)  # Wait before retrying

    # trafilatura does not handle retry with 503, often waiting 24 hours as
    # overwritten by the html, so such pages are skipped here.
    if response is None or response.status_code == 503:
        return []

    if url.endswith(".pdf"):
        os.makedirs("pdf_dir", exist_ok=True)
        pdf_path = f"pdf_dir/{temp_name}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        # The context manager closes the document once the text is read.
        with fitz.open(pdf_path) as doc:
            extracted_text = "".join(page.get_text() or "" for page in doc)

        return line_correction(extracted_text.split("\n"))

    return line_correction(url2lines(url))
|
45 |
+
|
46 |
+
|
47 |
+
if __name__ == "__main__":
    # Command-line entry point: read one TSV file of search results for a
    # claim, scrape every URL in it, and append one JSON line per URL to
    # <json_output_dir>/<claim_id>.json. Re-running resumes after the lines
    # already present in the output unless --overwrite_out_file is given.

    parser = argparse.ArgumentParser(description="Scraping text from URL")
    parser.add_argument(
        "-i",
        "--tsv_input_file",
        type=str,
        required=True,
        help="The path of the input files containing URLs from Google search.",
    )
    parser.add_argument(
        "-o",
        "--json_output_dir",
        type=str,
        default="output",
        help="The output JSON file to save the scraped data.",
    )
    parser.add_argument(
        "--overwrite_out_file",
        action="store_true",
    )

    args = parser.parse_args()

    # Explicit check instead of assert, which is stripped under "python -O".
    if os.path.splitext(args.tsv_input_file)[-1] != ".tsv":
        parser.error("The input should be a tsv file.")

    os.makedirs(args.json_output_dir, exist_ok=True)

    total_scraped, empty, total_failed = 0, 0, 0

    print(f"Processing files {args.tsv_input_file}")

    st = time.time()

    claim_id = os.path.splitext(os.path.basename(args.tsv_input_file))[0]
    json_output_path = os.path.join(args.json_output_dir, f"{claim_id}.json")

    lines_skipped = 0
    if os.path.exists(json_output_path):
        if args.overwrite_out_file:
            os.remove(json_output_path)
        else:
            # Resume: every line already written corresponds to one input row.
            with open(json_output_path, "r", encoding="utf-8") as json_file:
                existing_data = json_file.readlines()
                lines_skipped = len(existing_data)
                print(f" Skipping {lines_skipped} lines in {json_output_path}")

    # Some tsv files fail to load with one library, so try three loaders in
    # turn: Pandas, then NumPy, then the csv module.
    data = None
    try:
        df = pd.read_csv(args.tsv_input_file, sep="\t", header=None)
        data = df.values
        print("Data loaded successfully with Pandas.")

    except Exception as e:
        print("Error loading with Pandas:", e)
        try:
            data = np.genfromtxt(
                args.tsv_input_file, delimiter="\t", dtype=None, encoding=None
            )
            print("Data loaded successfully with NumPy.")
        except Exception as e:
            print("Error loading with NumPy:", e)
            # If NumPy loading fails as well, fall back to the csv module.
            try:
                data = []
                with open(args.tsv_input_file, "r", newline="") as tsvfile:
                    reader = csv.reader(tsvfile, delimiter="\t")
                    for row in reader:
                        data.append(row)
                print("Data loaded successfully with csv.")
            except Exception as e:
                print("Error loading with csv:", e)
                data = None

    if data is None:
        # Without this guard the len(data) call below fails with a TypeError.
        raise SystemExit(f"Could not load {args.tsv_input_file} with any loader.")

    if len(data) == lines_skipped:
        print(" No more lines need to be processed!")
    else:
        with open(json_output_path, "a", encoding="utf-8") as json_file:
            for index, row in enumerate(data):
                if index < lines_skipped:
                    continue
                url = row[2]
                json_data = {
                    "claim_id": claim_id,
                    "type": row[1],
                    "query": row[3],
                    "url": url,
                    "url2text": [],
                }
                print(f"Scraping text for url_{index}: {url}!")
                try:
                    scrape_result = scrape_text_from_url(url, claim_id)
                    json_data["url2text"] = scrape_result

                    if len(json_data["url2text"]) > 0:
                        total_scraped += 1
                    else:
                        empty += 1

                except Exception as e:
                    # Keep going, but report why this URL failed. A record
                    # with an empty "url2text" is still written below so that
                    # output lines stay aligned with input rows for resuming.
                    total_failed += 1
                    print(f"Failed to scrape url_{index}: {e}")

                json_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                json_file.flush()

    print(f"Output for {args.tsv_input_file} saved to {json_output_path}")
    elapsed_time = time.time() - st
    elapsed_minutes = int(elapsed_time // 60)
    elapsed_seconds = int(elapsed_time % 60)
    print(f"Time elapsed: {elapsed_minutes}min {elapsed_seconds}sec")
    print(f"{total_scraped} scraped, {empty} empty, {total_failed} failed")
|