# Uploaded by egumasa in commit 5edd591 ("pushes source files").
import re
import os
import spacy_streamlit
from collections import Counter
import glob
import spacy
from spacy.tokens import Doc
from spacy.cli._util import import_code
from utils.visualize import visualize_spans
from utils.utility import preprocess, delete_overlapping_span, cleanup_justify
from resources.text_list import TEXT_LIST
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
from resources.colors import COLORS_1
from skbio import diversity as dv
from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
import pandas as pd
# from pipeline.custom_functions import custom_functions
# Span attributes requested from const_table (passed as its `attrs` argument).
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]

# Label inventory used to reindex the per-document label counts, so that
# every document reports all ten categories (missing ones are filled with 0).
CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]
# spacy.prefer_gpu()
def load_model(spacy_model):
    """Load a spaCy pipeline and append a sentencizer component to it.

    Args:
        spacy_model: Model name or path, handed unchanged to ``spacy.load``.

    Returns:
        The loaded pipeline object, with ``"sentencizer"`` added via
        ``add_pipe``.
    """
    pipeline = spacy.load(spacy_model)
    pipeline.add_pipe("sentencizer")
    return pipeline
# source = spacy.blank("en")

# Name of the spaCy pipeline to load; it also names the per-model output
# sub-directories created below and is embedded in the output file names.
modelname = "en_engagement_LSTM_f3"
# modelname = "en_engagement_LSTM_f5"
# modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

# NOTE(review): machine-specific absolute path — presumably the author's local
# results folder; consider making it configurable.
# exist_ok=True: without it every re-run of the script aborts with
# FileExistsError because the directory was created by the previous run.
os.makedirs(
    os.path.join(
        "/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results",
        modelname,
    ),
    exist_ok=True,
)

# Import the project's custom code before loading the model.
# NOTE(review): presumably this registers components that the pipeline's
# config refers to — confirm against pipeline/custom_functions.py.
import_code("pipeline/custom_functions.py")

# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
nlp = spacy.load(modelname)

# Single-document prototype of the per-file loop, kept for reference:
# doc = nlp(preprocess(TEXT_LIST[0]))
# cleanup_justify(doc, doc.spans["sc"])
# delete_overlapping_span(doc.spans['sc'])
# data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
# seq = [s for s in doc.spans["sc"]]
# span_ngrams = ngrammar(seq=seq, n=3)
# df = pd.DataFrame(data, columns=cols)
# constant_value = 42
# new_col = pd.Series([constant_value] * df.shape[0], name='new_col')
# doclen = len(doc)
# doc_len = pd.Series([doclen] * df.shape[0], name='nwords')
# df.insert(0, "new", new_col, True)
# df.insert(1, "nwords", doc_len, True)
# df.to_csv("results/test.csv")

# Input texts: every .txt file one directory level below ICNALE_texts/.
# inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
inputfiles = glob.glob("ICNALE_texts/*/*.txt")

# Root directory for all CSV output.
savedir = "ICNALE_analysis"

# When True, per-file tables are also accumulated in memory so a combined
# CSV can be written after all files are processed.
storeall = True
storage = []  # span-level DataFrames, one per input file

# Use `savedir` rather than repeating the literal so the directory that is
# created is always the one the CSVs are written to; exist_ok=True makes the
# script re-runnable.
os.makedirs(os.path.join(savedir, modelname), exist_ok=True)

doc_level_storage = []  # document-level DataFrames, one per input file
# Run the pipeline over every input file and write two CSVs per file:
#   {savedir}/{modelname}/{filename}.csv        span-level table
#   {savedir}/{modelname}/ddata_{filename}.csv  label counts + diversity values
for file in inputfiles:
    filename = os.path.split(file)[-1]

    # Explicit encoding so the result does not depend on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    text = preprocess(text)
    doc = nlp(text)

    # Post-process the predicted spans stored under the "sc" key before
    # tabulating them.
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans["sc"])

    data, cols = const_table(doc, spans_key="sc", attrs=SPAN_ATTRS)
    seq = list(doc.spans["sc"])
    # NOTE(review): the trigram result is computed but not used or saved.
    span_ngrams = ngrammar(seq=seq, n=3)

    ### Make it a dataset
    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})  # convert col type
    df = df.sort_values(by=["start"])  # and sort by start

    doclen = len(doc)  # number of tokens in the processed document
    # Scalars are broadcast to every row, so no index alignment is involved
    # (the frame's index is no longer 0..n-1 in order after sorting).
    df.insert(0, "filename", filename, True)
    df.insert(1, "nwords", doclen, True)
    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

    # Engagement ngrams
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)
    # DataFrame.insert modifies the frame in place and returns None, so its
    # result must not be assigned back (doing so replaced bidf with None).
    # NOTE(review): bidf is still not written anywhere.
    bidf.insert(0, "filename", filename, True)

    ## Document level
    # Count each label, forcing every category to appear (0 when absent).
    counts = df["label_"].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient="index")
    # NOTE(review): concatenating a named Series with a DataFrame along
    # axis 0 keeps them in separate columns, so after .T the counts and the
    # diversity values land on separate rows — confirm this layout is wanted.
    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, True)
    # Insert the scalar length: a Series indexed by span row would be aligned
    # against doc_data's unrelated index and produce NaN.
    doc_data.insert(1, "nwords", doclen, True)
    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)

# pd.concat raises ValueError on an empty list, which happens when no input
# file matched or storeall is False; only write the combined table when there
# is something to combine.
if storage:
    alldf = pd.concat(storage)
    alldf.to_csv(f"{savedir}/0_{modelname}_20230726.csv")

# alldoc = pd.concat(doc_level_storage)
# alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")