|
import re |
|
import os |
|
import spacy_streamlit |
|
from collections import Counter |
|
import glob |
|
|
|
import spacy |
|
from spacy.tokens import Doc |
|
from spacy.cli._util import import_code |
|
|
|
from utils.visualize import visualize_spans |
|
from utils.util import preprocess, delete_overlapping_span, cleanup_justify |
|
|
|
from resources.text_list import TEXT_LIST |
|
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START |
|
from resources.colors import COLORS_1 |
|
|
|
|
|
from skbio import diversity as dv |
|
|
|
from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values |
|
import pandas as pd |
|
|
|
|
|
# Span attributes requested from const_table() for the per-span table.
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]

# Label inventory used to reindex the per-document label counts, so that
# every category gets a column (0 when the label does not occur).
CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]
|
|
|
|
|
|
|
|
|
def load_model(spacy_model):
    """Load a spaCy pipeline and make sure it contains a ``sentencizer``.

    Args:
        spacy_model: Model name or path, passed straight to ``spacy.load``.

    Returns:
        The loaded pipeline object, with a ``sentencizer`` component added
        unless one with that name is already present.
    """
    nlp = spacy.load(spacy_model)

    # add_pipe() raises when a component with the same name already exists,
    # so only add the sentencizer if the loaded pipeline does not have one.
    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")

    return nlp
|
|
|
|
|
|
|
|
|
|
|
# Name of the spaCy pipeline to load; also used to name output folders/files.
modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

# NOTE(review): machine-specific absolute path; nothing else in this script
# reads from it -- consider making it relative/configurable.
# exist_ok=True keeps a second run from crashing with FileExistsError once
# the directory has been created.
os.makedirs(
    os.path.join(
        "/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results",
        modelname,
    ),
    exist_ok=True,
)

# Run the project's custom pipeline code before loading the model
# (presumably it registers components the pipeline needs -- confirm).
import_code("pipeline/custom_functions.py")

nlp = spacy.load(modelname)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Input texts. glob() returns paths in arbitrary order, so sort them to make
# the row order of the concatenated output files reproducible across runs.
inputfiles = sorted(glob.glob("ECCE_texts/preprocessed/*.txt"))

# Root output directory; per-file tables go to <savedir>/<modelname>/.
savedir = "ECCE_analysis"

# When True, per-file tables are also accumulated for corpus-level export.
storeall = True
storage = []            # per-span tables, one DataFrame per input file
doc_level_storage = []  # per-document summary tables, one per input file

# exist_ok=True so that re-running the script does not fail once the output
# directory already exists.
os.makedirs(os.path.join(savedir, modelname), exist_ok=True)
|
|
|
# Per-file processing: annotate each text, export its span table and its
# document-level summary, and optionally keep both for corpus-level export.
for file in inputfiles:
    filename = os.path.basename(file)

    # Explicit encoding so the result does not depend on the platform's
    # default locale. NOTE(review): assumes the corpus is UTF-8 -- confirm.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    text = preprocess(text)
    doc = nlp(text)

    # Project helpers applied to the spans stored under the "sc" key; their
    # return values are ignored here, so they presumably edit in place.
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans["sc"])

    data, cols = const_table(doc, spans_key="sc", attrs=SPAN_ATTRS)
    seq = list(doc.spans["sc"])

    # Trigrams over the span sequence; computed but not written out yet.
    span_ngrams = ngrammar(seq=seq, n=3)

    # ---- span-level table ------------------------------------------------
    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})
    df = df.sort_values(by=["start"])

    doclen = len(doc)  # number of tokens in the processed document

    # Scalars are broadcast to every row, so no per-row Series is needed.
    df.insert(0, "filename", filename, True)
    df.insert(1, "nwords", doclen, True)
    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

    # ---- span bigrams (not written out yet) --------------------------------
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)
    # DataFrame.insert works in place and returns None; assigning its result
    # back to ``bidf`` (as was done before) threw the table away.
    bidf.insert(0, "filename", filename, True)

    # ---- document-level summary ------------------------------------------
    # Count labels and reindex so that every category is present (0 if absent).
    counts = df["label_"].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient="index")

    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, True)
    # Use the scalar length: a per-span Series would be aligned by index label
    # against doc_data's unrelated index, and is empty when a text has no spans.
    doc_data.insert(1, "nwords", doclen, True)
    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)
|
|
|
|
|
# Corpus-level export. pd.concat raises ValueError on an empty list, which
# happens when storeall is False or when no input file matched, so only
# concatenate and write when there is something to write.
if storage:
    alldf = pd.concat(storage)
    alldf.to_csv(f"{savedir}/0_{modelname}_20230426.csv")

if doc_level_storage:
    alldoc = pd.concat(doc_level_storage)
    alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")
|
|