Spaces:

egumasa
/

engagement-analyzer-demo

Running

App Files Files Community

engagement-analyzer-demo / analyzer.py

egumasa

rm private files

f38de6b about 1 year ago

raw

history blame

4.24 kB

	import re
	import os
	import spacy_streamlit
	from collections import Counter
	import glob

	import spacy
	from spacy.tokens import Doc
	from spacy.cli._util import import_code

	from utils.visualize import visualize_spans
	from utils.util import preprocess, delete_overlapping_span, cleanup_justify

	from resources.text_list import TEXT_LIST
	from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
	from resources.colors import COLORS_1


	from skbio import diversity as dv

	from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
	import pandas as pd

	# from pipeline.custom_functions import custom_functions
	# Span attributes requested for each row of the span-level table.
	SPAN_ATTRS = [
	    "text",
	    "label_",
	    "start",
	    "end",
	]
	# Span labels counted per document; the order fixes the order of the counts.
	CATEGORIES = [
	    "ATTRIBUTION",
	    "CITATION",
	    "COUNTER",
	    "DENY",
	    "ENDOPHORIC",
	    "ENTERTAIN",
	    "JUSTIFYING",
	    "MONOGLOSS",
	    "PROCLAIM",
	    "SOURCES",
	]


	# spacy.prefer_gpu()

	def load_model(spacy_model):
	    """Load a spaCy pipeline and make sure it contains a ``sentencizer``.

	    Args:
	        spacy_model: Model name or path, passed straight to ``spacy.load``.

	    Returns:
	        The loaded pipeline, with a ``sentencizer`` component present.
	    """
	    nlp = spacy.load(spacy_model)
	    # add_pipe raises ValueError when a component with the same name is
	    # already in the pipeline, so only append the sentencizer if missing.
	    if "sentencizer" not in nlp.pipe_names:
	        nlp.add_pipe("sentencizer")
	    return nlp

	# source = spacy.blank("en")

	# Name of the spaCy model to run; the commented lines are alternatives.
	modelname = "en_engagement_LSTM_f3"
	# modelname = "en_engagement_LSTM_f5"
	# modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

	# NOTE(review): machine-specific absolute path, and nothing in the visible
	# script writes into it -- confirm it is still needed or make it relative.
	# exist_ok=True keeps a re-run from failing with FileExistsError.
	os.makedirs(
	    os.path.join(
	        "/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results",
	        modelname,
	    ),
	    exist_ok=True,
	)

	# Import the project's custom pipeline code before loading the model
	# (presumably it registers components the model config refers to -- verify).
	import_code("pipeline/custom_functions.py")

	# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
	nlp = spacy.load(modelname)
	# doc = nlp(preprocess(TEXT_LIST[0]))

	# cleanup_justify(doc, doc.spans["sc"])
	# delete_overlapping_span(doc.spans['sc'])

	# data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
	# seq = [s for s in doc.spans["sc"]]
	# span_ngrams = ngrammar(seq=seq, n=3)

	# df = pd.DataFrame(data, columns=cols)

	# constant_value = 42
	# new_col = pd.Series([constant_value] * df.shape[0], name='new_col')

	# doclen = len(doc)
	# doc_len = pd.Series([doclen] * df.shape[0], name='nwords')

	# df.insert(0, "new", new_col, True)
	# df.insert(1, "nwords", doc_len, True)

	# df.to_csv("results/test.csv")



	# inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
	# Every .txt file one directory level below ICNALE_texts.
	# NOTE(review): the previous pattern "ICNALE_texts//.txt" had no wildcard and
	# could only match a file literally named ".txt"; it looks like "*/*" with
	# the asterisks stripped. Confirm the corpus layout is ICNALE_texts/<dir>/<file>.txt.
	inputfiles = glob.glob("ICNALE_texts/*/*.txt")
	savedir = "ICNALE_analysis"
	# When True, per-file tables are also collected for one combined CSV.
	storeall = True
	storage = []
	# Reuse savedir instead of repeating the literal; exist_ok=True so a second
	# run does not crash with FileExistsError.
	os.makedirs(os.path.join(savedir, modelname), exist_ok=True)


	# Per-document summary tables, collected alongside `storage` when storeall is set.
	doc_level_storage = []

	# For each input text: run the pipeline, write a span-level CSV and a
	# document-level CSV, and optionally keep both tables for later merging.
	for path in inputfiles:
	    filename = os.path.split(path)[-1]

	    # Explicit encoding so the result does not depend on the platform default.
	    with open(path, "r", encoding="utf-8") as f:
	        text = f.read()

	    text = preprocess(text)
	    doc = nlp(text)
	    cleanup_justify(doc, doc.spans["sc"])
	    delete_overlapping_span(doc.spans["sc"])

	    data, cols = const_table(doc, spans_key="sc", attrs=SPAN_ATTRS)
	    seq = list(doc.spans["sc"])
	    # NOTE(review): the trigram result is not used anywhere below.
	    span_ngrams = ngrammar(seq=seq, n=3)

	    ### Make it a dataset
	    df = pd.DataFrame(data, columns=cols)
	    df = df.astype({"start": int, "end": int})  # convert col type
	    df = df.sort_values(by=["start"])  # and sort by start

	    doclen = len(doc)  # number of tokens in the processed document

	    # A scalar is broadcast to every row, so no helper Series (and no index
	    # alignment against the re-sorted frame) is needed.
	    df.insert(0, "filename", filename, True)
	    df.insert(1, "nwords", doclen, True)
	    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

	    # Engagement ngrams
	    # NOTE(review): bigrams are built from the span objects, not from the
	    # label column, and the table is not written out -- confirm intent.
	    span_bigrams = ngrammar(seq=seq, n=2)
	    bidf = pd.DataFrame(span_bigrams)
	    # DataFrame.insert works in place and returns None; the earlier
	    # `bidf = bidf.insert(...)` replaced the frame with None.
	    bidf.insert(0, "filename", filename, True)

	    ## Document level
	    # Label frequencies in CATEGORIES order, with 0 for labels that are absent.
	    counts = df["label_"].value_counts().reindex(CATEGORIES, fill_value=0)
	    div = diversity_values(list(counts))

	    div_data = pd.DataFrame.from_dict(div, orient="index")

	    # NOTE(review): counts is a Series and div_data a DataFrame; how their
	    # columns line up after concat depends on the Series name -- verify the
	    # layout of the ddata_*.csv files.
	    doc_data = pd.concat([counts, div_data], axis=0).T
	    doc_data.insert(0, "filename", filename, True)
	    # Use the scalar token count: the old per-span Series was indexed by span
	    # row and did not align with doc_data's index, yielding NaN values.
	    doc_data.insert(1, "nwords", doclen, True)
	    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

	    if storeall:
	        storage.append(df)
	        doc_level_storage.append(doc_data)


	# pd.concat raises ValueError on an empty list (no input files matched, or
	# storeall is False), so only build the combined table when there is data.
	if storage:
	    alldf = pd.concat(storage)
	    alldf.to_csv(f"{savedir}/0_{modelname}_20230726.csv")
	else:
	    print("No per-file tables were collected; skipping the combined CSV.")


	# alldoc = pd.concat(doc_level_storage)
	# alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")