Chenxi Whitehouse committed on
Commit
17af92c
·
1 Parent(s): 122e10f
README.md CHANGED
@@ -1,3 +1,39 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ # AVeriTeC
6
+
7
+
8
+ Data, knowledge store and source code to reproduce the baseline experiments for the [AVeriTeC](https://arxiv.org/abs/2305.13117) dataset, which will be used for the 7th [FEVER](https://fever.ai/) workshop co-hosted at EMNLP 2024.
9
+
10
+
11
+ ### Set up environment
12
+
13
+ ```
14
+ conda create -n averitec python=3.11
15
+ conda activate averitec
16
+
17
+ pip install -r requirements.txt
18
+ python -m spacy download en_core_web_lg
19
+ python -m nltk.downloader punkt
20
+ python -m nltk.downloader wordnet
21
+ conda install pytorch pytorch-cuda=11.8 -c pytorch -c nvidia
22
+ ```
23
+
24
+ ### Scrape text from the URLs obtained by searching queries with the Google API.
25
+
26
+ We provide up to 1000 URLs for each claim returned from a Google API search using different queries. This is a courtesy aimed at reducing the cost of using the Google Search API for participants of the shared task. The URL files can be found [here](https://huggingface.co/chenxwh/AVeriTeC/tree/main/data_store/urls).
27
+
28
+ You can use your own scraping tool to extract sentences from the URLs. Alternatively, we have included a scraping tool for this purpose, which can be executed as follows. The processed files are also provided and can be found [here](https://huggingface.co/chenxwh/AVeriTeC/tree/main/data_store/knowledge_store).
29
+
30
+ ```
31
+ bash script/scraper.sh <split> <start_idx> <end_idx>
32
+ # e.g., bash script/scraper.sh dev 0 500
33
+ ```
34
+
35
+ ### Rank the sentences in the knowledge store with BM25
36
+ See [bm25_sentenes.py](https://huggingface.co/chenxwh/AVeriTeC/tree/main/src/reranking/bm25_sentenes.py) for the full list of command-line arguments.
37
+ ```
38
+ python -m src.reranking.bm25_sentenes
39
+ ```
script/scraper.sh ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Scrape the URLs of claims <start_idx> .. <end_idx>-1 of one split.
#
# Usage: bash script/scraper.sh <split> <start_idx> <end_idx>
#   e.g. bash script/scraper.sh dev 0 500
#
# One background scraper process is started per claim index; the script
# then waits for all of them to finish.

if [ "$#" -ne 3 ]; then
    echo "Usage: bash script/scraper.sh <split> <start_idx> <end_idx>" >&2
    exit 1
fi

for ((i=$2;i<$3;i++))
do
    echo "$i"
    python -m src.retrieval.scraper_for_knowledge_store -i ../AVeriTeC/data_store/"$1"_store/"$i".tsv -o data_store/output_"$1" &
done

wait
src/reranking/bm25_sentenes.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import time
5
+ import numpy as np
6
+ import nltk
7
+ from rank_bm25 import BM25Okapi
8
+
9
+
10
def combine_all_sentences(knowledge_file):
    """Collect every scraped sentence for one claim together with its URL.

    The knowledge file is read line by line; each line must be a JSON
    object with a "url" entry and a "url2text" list of sentences.
    Sentences are concatenated in file order and are NOT de-duplicated.

    Args:
        knowledge_file: Path of the JSON-lines file for one claim.

    Returns:
        A tuple ``(sentences, urls, num_urls)`` where ``urls[j]`` is the
        URL that ``sentences[j]`` came from and ``num_urls`` is the number
        of lines (URL records) read. An empty file yields ``([], [], 0)``.
    """
    sentences, urls = [], []
    # Counted explicitly so that an empty file returns 0 instead of
    # failing on an unbound loop variable.
    num_urls = 0

    with open(knowledge_file, "r", encoding="utf-8") as json_file:
        for line in json_file:
            data = json.loads(line)
            url_sentences = data["url2text"]
            sentences.extend(url_sentences)
            # Repeat the URL once per sentence to keep both lists aligned.
            urls.extend([data["url"]] * len(url_sentences))
            num_urls += 1
    return sentences, urls, num_urls
20
+
21
+
22
def retrieve_top_k_sentences(query, document, urls, top_k):
    """Rank sentences against a query with BM25 and keep the best ones.

    Args:
        query: The query text (the claim); tokenized with
            ``nltk.word_tokenize``.
        document: List of candidate sentences.
        urls: List parallel to ``document``; ``urls[j]`` is the source of
            ``document[j]``.
        top_k: Maximum number of sentences to return.

    Returns:
        A tuple ``(top_sentences, top_urls)`` ordered from highest to
        lowest BM25 score. Both lists are empty when ``document`` is empty.
    """
    # NOTE(review): BM25Okapi appears to fail on an empty corpus
    # (rank_bm25 seems to divide by the corpus size when computing the
    # average document length), so return early when nothing was scraped.
    if not document:
        return [], []

    tokenized_docs = [nltk.word_tokenize(doc) for doc in document]
    bm25 = BM25Okapi(tokenized_docs)
    scores = bm25.get_scores(nltk.word_tokenize(query))
    # argsort is ascending, so reverse it before taking the first top_k.
    top_k_idx = np.argsort(scores)[::-1][:top_k]

    return [document[i] for i in top_k_idx], [urls[i] for i in top_k_idx]
29
+
30
+
31
if __name__ == "__main__":
    # Command-line entry point: for every claim whose index lies in
    # [start, end), load its knowledge-store file, rank the sentences
    # against the claim with BM25 and write one JSON line with the top_k
    # sentences (and their URLs) to the output file.

    parser = argparse.ArgumentParser(
        description="Get top 100 sentences for sentences in the knowlede store"
    )
    parser.add_argument(
        "-k",
        "--knowledge_store_dir",
        type=str,
        default="data_store/output_dev",
        help="The path of the knowledge_store_dir containing json files with all the retrieved sentences.",
    )
    parser.add_argument(
        "-c",
        "--claim_file",
        type=str,
        default="data/dev.json",
        help="The path of the file that stores the claim.",
    )
    parser.add_argument(
        "-o",
        "--json_output",
        type=str,
        default="data_store/dev_top_k.json",
        help="The output dir for JSON files to save the top 100 sentences for each claim.",
    )
    parser.add_argument(
        "--top_k",
        default=100,
        type=int,
        help="How many documents should we pick out with BM25.",
    )
    parser.add_argument(
        "-s",
        "--start",
        type=int,
        default=0,
        help="Staring index of the files to process.",
    )
    parser.add_argument(
        "-e", "--end", type=int, default=-1, help="End index of the files to process."
    )

    args = parser.parse_args()

    with open(args.claim_file, "r", encoding="utf-8") as json_file:
        target_examples = json.load(json_file)

    # -1 means "up to the number of entries in the knowledge store dir".
    if args.end == -1:
        args.end = len(os.listdir(args.knowledge_store_dir))
        print(args.end)

    # A range supports O(1) membership tests, unlike a materialized list
    # that would be scanned once per claim.
    files_to_process = range(args.start, args.end)
    total = len(files_to_process)

    with open(args.json_output, "w", encoding="utf-8") as output_json:
        done = 0
        for idx, example in enumerate(target_examples):
            if idx not in files_to_process:
                continue

            # Load the knowledge store for this example
            print(f"Processing claim {idx}... Progress: {done + 1} / {total}")
            document_in_sentences, sentence_urls, num_urls_this_claim = (
                combine_all_sentences(
                    os.path.join(args.knowledge_store_dir, f"{idx}.json")
                )
            )

            print(
                f"Obtained {len(document_in_sentences)} sentenes from {num_urls_this_claim} urls."
            )

            # Retrieve top_k sentences with bm25
            st = time.time()
            top_k_sentences, top_k_urls = retrieve_top_k_sentences(
                example["claim"], document_in_sentences, sentence_urls, args.top_k
            )
            print(f"Top {args.top_k} retrieved. Time elapsed: {time.time() - st}.")

            json_data = {
                "claim_id": idx,
                "claim": example["claim"],
                f"top_{args.top_k}": [
                    {"sentence": sent, "url": url}
                    for sent, url in zip(top_k_sentences, top_k_urls)
                ],
            }
            output_json.write(json.dumps(json_data, ensure_ascii=False) + "\n")
            done += 1
src/retrieval/html2lines.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from time import sleep
3
+ import trafilatura
4
+ from trafilatura.meta import reset_caches
5
+ from trafilatura.settings import DEFAULT_CONFIG
6
+ import spacy
7
+
8
+
9
# Load the large English spaCy pipeline once at import time; line_correction
# uses it to split long lines into sentences.
nlp = spacy.load("en_core_web_lg")


# NOTE(review): DEFAULT_CONFIG is presumably a configparser object, in which
# case this attribute assignment may not change the MAX_FILE_SIZE option that
# trafilatura actually reads (that would need DEFAULT_CONFIG.set(...)) —
# confirm before relying on this limit.
DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
MIN_CHAR = 50  # line_correction drops lines shorter than this many characters
MAX_CHAR = 5000  # only the first MAX_CHAR characters of a long line go to spaCy
15
+
16
+
17
def get_page(url):
    """Download *url* with trafilatura, trying up to three times.

    Args:
        url: The address to fetch.

    Returns:
        Whatever ``trafilatura.fetch_url`` returned on the first successful
        attempt, or ``None`` if every attempt failed or returned ``None``.
    """
    attempts = 3
    for attempt in range(attempts):
        try:
            # For a website that is "maintaining", trafilatura respects the
            # retry delay announced by the page and can wait for 24 hours.
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
        except Exception:
            # Best-effort scraping: treat any fetch error as a failed
            # attempt, but let KeyboardInterrupt/SystemExit propagate.
            page = None

        if page is not None:
            print("Fetched " + url, file=sys.stderr)
            return page

        # No point in sleeping after the final attempt.
        if attempt < attempts - 1:
            sleep(3)
    return None
29
+
30
+
31
def url2lines(url):
    """Fetch *url* and return its extracted text as a list of lines.

    Returns an empty list when the page could not be downloaded.
    """
    fetched = get_page(url)
    return [] if fetched is None else html2lines(fetched)
39
+
40
+
41
def line_correction(lines, max_size=100):
    """Drop too-short lines and re-chunk long lines on sentence boundaries.

    Args:
        lines: Iterable of text lines.
        max_size: Lines of at most this many characters are kept as they
            are; longer lines are split into sentences with spaCy and the
            sentences are re-joined (separated by one space) into chunks
            that are emitted as soon as they exceed ``max_size``.

    Returns:
        A list of lines, each at least ``MIN_CHAR`` characters long.
    """
    out_lines = []
    for line in lines:
        if len(line) < MIN_CHAR:
            continue

        if len(line) <= max_size:
            out_lines.append(line)
            continue

        # We split lines into sentences, but for performance we take only
        # the first MAX_CHAR characters per line.
        doc = nlp(line[:MAX_CHAR])
        stack = ""
        for sent in doc.sents:
            if len(stack) > 0:
                stack += " "
            stack += str(sent).strip()
            if len(stack) > max_size:
                out_lines.append(stack)
                stack = ""

        # Ensure every line in out_lines satisfies the MIN_CHAR restriction.
        # ">=" mirrors the "< MIN_CHAR" filter above, so a leftover of
        # exactly MIN_CHAR characters is kept like any other line.
        if len(stack) >= MIN_CHAR:
            out_lines.append(stack)

    return out_lines
68
+
69
+
70
def html2lines(page):
    """Extract the text of a downloaded page and split it into lines.

    Args:
        page: The downloaded page content, or ``None``.

    Returns:
        The text returned by ``trafilatura.extract`` split on newlines, or
        an empty list when ``page`` is ``None``/blank or nothing could be
        extracted.
    """
    # Check for None first: calling .strip() on None would raise.
    if page is None or len(page.strip()) == 0:
        return []

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    reset_caches()

    if text is None:
        return []

    # We just spit out the entire page, so it needs to be reformatted later.
    return text.split("\n")
src/retrieval/scraper_for_knowledge_store.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import csv
4
+ from time import sleep
5
+ import time
6
+ import json
7
+ import numpy as np
8
+ import fitz
9
+ import pandas as pd
10
+ import requests
11
+ from src.retrieval.html2lines import url2lines, line_correction
12
+
13
# Raise the csv module's maximum field size so that very large TSV fields
# can be read by the csv fallback loader.
csv.field_size_limit(100000000)

MAX_RETRIES = 3  # maximum number of requests.get attempts per URL
TIMEOUT = 5  # time limit for request (passed as `timeout` to requests.get)
17
+
18
+
19
def scrape_text_from_url(url, temp_name):
    """Scrape the text behind *url* and return it as cleaned-up lines.

    The URL is first requested with ``requests`` (up to ``MAX_RETRIES``
    attempts). URLs ending in ".pdf" are saved to ``pdf_dir/<temp_name>.pdf``
    and their text is read with PyMuPDF; any other URL is handed to
    ``url2lines``. The result is passed through ``line_correction``.

    Args:
        url: The address to scrape.
        temp_name: Base name of the temporary PDF file.

    Returns:
        A list of text lines; empty when no response was obtained or the
        server answered with status 503.
    """
    response = None
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            break  # success: do not request the same URL again
        except requests.RequestException:
            if attempt < MAX_RETRIES - 1:
                sleep(3)  # Wait before retrying

    # trafilatura does not handle retry with 503, often waiting 24 hours as
    # overwritten by the html, so such pages are skipped here.
    if response is None or response.status_code == 503:
        return []

    if url.endswith(".pdf"):
        # The target directory may not exist on a fresh checkout.
        os.makedirs("pdf_dir", exist_ok=True)
        pdf_path = f"pdf_dir/{temp_name}.pdf"
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        doc = fitz.open(pdf_path)
        try:
            # Iterate the document pages, extracting each page's text once.
            extracted_text = "".join(page.get_text() or "" for page in doc)
        finally:
            doc.close()

        return line_correction(extracted_text.split("\n"))

    return line_correction(url2lines(url))
45
+
46
+
47
if __name__ == "__main__":
    # Command-line entry point: read one TSV file of search results for a
    # claim, scrape every URL in it and append one JSON line per URL to
    # <json_output_dir>/<claim_id>.json. Unless --overwrite_out_file is
    # given, an existing output file is resumed: as many TSV rows as the
    # file already has lines are skipped.

    parser = argparse.ArgumentParser(description="Scraping text from URL")
    parser.add_argument(
        "-i",
        "--tsv_input_file",
        type=str,
        help="The path of the input files containing URLs from Google search.",
    )
    parser.add_argument(
        "-o",
        "--json_output_dir",
        type=str,
        default="output",
        help="The output JSON file to save the scraped data.",
    )
    parser.add_argument(
        "--overwrite_out_file",
        action="store_true",
    )

    args = parser.parse_args()

    # Validate explicitly: an assert would be stripped under `python -O`
    # and a missing -i would otherwise fail with an opaque TypeError.
    if (
        args.tsv_input_file is None
        or os.path.splitext(args.tsv_input_file)[-1] != ".tsv"
    ):
        parser.error("The input should be a tsv file.")

    os.makedirs(args.json_output_dir, exist_ok=True)

    total_scraped, empty, total_failed = 0, 0, 0

    print(f"Processing files {args.tsv_input_file}")

    st = time.time()

    # The TSV file name (without extension) is used as the claim id.
    claim_id = os.path.splitext(os.path.basename(args.tsv_input_file))[0]
    json_output_path = os.path.join(args.json_output_dir, f"{claim_id}.json")

    lines_skipped = 0
    if os.path.exists(json_output_path):
        if args.overwrite_out_file:
            os.remove(json_output_path)
        else:
            # Each output line corresponds to one already-processed TSV row.
            with open(json_output_path, "r", encoding="utf-8") as json_file:
                lines_skipped = sum(1 for _ in json_file)
            print(f" Skipping {lines_skipped} lines in {json_output_path}")

    # Some tsv files fail to load with one library, so try Pandas, then
    # NumPy, then the csv module.
    data = None
    try:
        df = pd.read_csv(args.tsv_input_file, sep="\t", header=None)
        data = df.values
        print("Data loaded successfully with Pandas.")

    except Exception as e:
        print("Error loading with Pandas:", e)
        try:
            data = np.genfromtxt(
                args.tsv_input_file, delimiter="\t", dtype=None, encoding=None
            )
            print("Data loaded successfully with NumPy.")
        except Exception as e:
            print("Error loading with NumPy:", e)
            # If NumPy loading fails, attempt to load with the csv module
            try:
                with open(args.tsv_input_file, "r", newline="") as tsvfile:
                    data = list(csv.reader(tsvfile, delimiter="\t"))
                print("Data loaded successfully with csv.")
            except Exception as e:
                print("Error loading with csv:", e)
                data = None

    if data is None:
        # Every loader failed; stop with a clear message instead of
        # crashing on len(None) below.
        raise SystemExit(f"Could not load {args.tsv_input_file} with any loader.")

    if len(data) == lines_skipped:
        print(" No more lines need to be processed!")
    else:
        with open(json_output_path, "a", encoding="utf-8") as json_file:
            for index, row in enumerate(data):
                if index < lines_skipped:
                    continue
                url = row[2]
                json_data = {
                    "claim_id": claim_id,
                    "type": row[1],
                    "query": row[3],
                    "url": url,
                    "url2text": [],
                }
                print(f"Scraping text for url_{index}: {url}!")
                try:
                    scrape_result = scrape_text_from_url(url, claim_id)
                    json_data["url2text"] = scrape_result

                    if len(json_data["url2text"]) > 0:
                        total_scraped += 1
                    else:
                        empty += 1

                except Exception as e:
                    # Best effort: record the failure and keep going, but do
                    # not hide the reason.
                    total_failed += 1
                    print(f"Failed to scrape url_{index}: {e}")

                # A line is written even on failure (with an empty
                # "url2text") so the resume logic stays aligned with the
                # TSV rows.
                json_file.write(json.dumps(json_data, ensure_ascii=False) + "\n")
                json_file.flush()

        print(f"Output for {args.tsv_input_file} saved to {json_output_path}")
        elapsed_time = time.time() - st
        elapsed_minutes = int(elapsed_time // 60)
        elapsed_seconds = int(elapsed_time % 60)
        print(f"Time elapsed: {elapsed_minutes}min {elapsed_seconds}sec")
        print(f"{total_scraped} scraped, {empty} empty, {total_failed} failed")