"""Batch-run a spaCy engagement span model over a corpus and export tables.

For every input text the script writes a span-level CSV and a
document-level CSV (category counts plus diversity values) under
``<savedir>/<modelname>/``, then one combined span-level CSV for all files.
"""
import glob
import os
import re
from collections import Counter

import pandas as pd
import spacy
import spacy_streamlit
from skbio import diversity as dv
from spacy.cli._util import import_code
from spacy.tokens import Doc

from pipeline.post_processors import (
    const_table,
    diversity_values,
    ngrammar,
    simple_table,
)
from resources.colors import COLORS_1
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
from resources.text_list import TEXT_LIST
from utils.util import cleanup_justify, delete_overlapping_span, preprocess
from utils.visualize import visualize_spans

# from pipeline.custom_functions import custom_functions

# Span attributes requested from const_table for each span row.
SPAN_ATTRS = ["text", "label_", "start", "end"]
# Label inventory used to reindex per-document counts, so every document
# gets the same columns (missing labels are filled with 0).
CATEGORIES = ['ATTRIBUTION', "CITATION", "COUNTER", "DENY", "ENDOPHORIC",
              "ENTERTAIN", "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]

# spacy.prefer_gpu()


def load_model(spacy_model):
    """Load a spaCy pipeline and add a ``sentencizer`` component to it.

    Args:
        spacy_model: Model name or path, passed unchanged to ``spacy.load``.

    Returns:
        The loaded pipeline after ``add_pipe('sentencizer')``.
    """
    # source = spacy.blank("en")
    nlp = spacy.load(spacy_model)  # , vocab=nlp_to_copy.vocab
    nlp.add_pipe('sentencizer')
    return nlp


# source = spacy.blank("en")

modelname = "en_engagement_LSTM_f3"
# Alternative models:
# modelname = "en_engagement_LSTM_f5"
# modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

# NOTE(review): machine-specific absolute path; nothing below writes into it.
# exist_ok=True keeps a second run from failing with FileExistsError.
os.makedirs(
    os.path.join(
        "/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results",
        modelname,
    ),
    exist_ok=True,
)

# Import the project's custom pipeline code before the model is loaded
# (presumably it registers functions the model config refers to -- confirm).
import_code("pipeline/custom_functions.py")

# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
nlp = spacy.load(modelname)

# inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
# Sorted so the row order of the combined CSV is reproducible across runs.
inputfiles = sorted(glob.glob("ICNALE_texts/*/*.txt"))
savedir = "ICNALE_analysis"
storeall = True
storage = []
doc_level_storage = []

os.makedirs(os.path.join(savedir, modelname), exist_ok=True)

for file in inputfiles:
    # NOTE(review): only the basename is used for the output names, so two
    # inputs with the same name in different sub-folders overwrite each other.
    filename = os.path.split(file)[-1]
    with open(file, "r") as f:
        text = f.read()

    text = preprocess(text)
    doc = nlp(text)
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans['sc'])

    data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
    seq = list(doc.spans["sc"])
    # NOTE(review): the trigram result is computed but not exported anywhere.
    span_ngrams = ngrammar(seq=seq, n=3)

    # --- Span-level table ---------------------------------------------
    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})  # convert col type
    df = df.sort_values(by=['start'])  # and sort by start
    doclen = len(doc)
    # Scalars broadcast to every row, so no index alignment is involved.
    df.insert(0, "filename", filename, allow_duplicates=True)
    df.insert(1, "nwords", doclen, allow_duplicates=True)
    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

    # --- Engagement n-grams -------------------------------------------
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)
    # DataFrame.insert works in place and returns None; assigning its result
    # back (as the previous code did) would replace bidf with None.
    bidf.insert(0, "filename", filename, allow_duplicates=True)

    # --- Document-level table -----------------------------------------
    counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient='index')
    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, allow_duplicates=True)
    # Insert the scalar length: a per-span Series here would only work via
    # index alignment and would give NaN for a document with no spans.
    doc_data.insert(1, "nwords", doclen, allow_duplicates=True)
    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)

# pd.concat raises ValueError on an empty list (no inputs matched, or
# storeall disabled), so only build the combined table when there is data.
if storage:
    alldf = pd.concat(storage)
    alldf.to_csv(f"{savedir}/0_{modelname}_20230726.csv")
else:
    print("No span tables collected; combined CSV not written.")

# Document-level export is currently disabled:
# alldoc = pd.concat(doc_level_storage)
# alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")