File size: 4,240 Bytes
5edd591
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import re
import os
import spacy_streamlit
from collections import Counter
import glob

import spacy
from spacy.tokens import Doc
from spacy.cli._util import import_code

from utils.visualize import visualize_spans
from utils.utility import preprocess, delete_overlapping_span, cleanup_justify

from resources.text_list import TEXT_LIST
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
from resources.colors import COLORS_1


from skbio import diversity as dv

from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
import pandas as pd

# from pipeline.custom_functions import custom_functions
# Span attributes requested from const_table for the per-span table.
SPAN_ATTRS = [
    "text",
    "label_",
    "start",
    "end",
]

# Labels used to reindex the per-document label counts; labels missing from
# a document are filled with 0 and the column order follows this list.
CATEGORIES = [
    "ATTRIBUTION",
    "CITATION",
    "COUNTER",
    "DENY",
    "ENDOPHORIC",
    "ENTERTAIN",
    "JUSTIFYING",
    "MONOGLOSS",
    "PROCLAIM",
    "SOURCES",
]


# spacy.prefer_gpu()

def load_model(spacy_model: str) -> "spacy.Language":
    """Load a spaCy pipeline and make sure it has a ``sentencizer``.

    Args:
        spacy_model: Name or path handed to ``spacy.load``.

    Returns:
        The loaded pipeline. A ``sentencizer`` component is appended when the
        pipeline does not already contain one under that name.
    """
    nlp = spacy.load(spacy_model)
    # add_pipe raises ValueError when a component with the same name is
    # already present, so only add the sentencizer when it is missing.
    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")
    return nlp

# source = spacy.blank("en")

# Name (or path) of the pipeline passed to spacy.load below; it is also used
# as the name of the per-model output directories. Alternatives are kept for
# quick switching.
modelname = "en_engagement_LSTM_f3"
# modelname = "en_engagement_LSTM_f5"
# modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

# NOTE(review): machine-specific absolute path -- confirm before running on
# another machine.
# exist_ok=True: without it every re-run fails with FileExistsError once the
# directory has been created.
os.makedirs(
    os.path.join(
        "/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results",
        modelname,
    ),
    exist_ok=True,
)

# Import the project's custom code before loading the pipeline
# (presumably registers components the model relies on -- verify).
import_code("pipeline/custom_functions.py")

# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
nlp = spacy.load(modelname)
# doc = nlp(preprocess(TEXT_LIST[0]))

# cleanup_justify(doc, doc.spans["sc"])
# delete_overlapping_span(doc.spans['sc'])

# data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
# seq = [s for s in doc.spans["sc"]]
# span_ngrams = ngrammar(seq=seq, n=3)

# df = pd.DataFrame(data, columns=cols)

# constant_value = 42
# new_col = pd.Series([constant_value] * df.shape[0], name='new_col')

# doclen = len(doc)
# doc_len = pd.Series([doclen] * df.shape[0], name='nwords')

# df.insert(0, "new", new_col, True)
# df.insert(1, "nwords", doc_len, True)

# df.to_csv("results/test.csv")



# inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
# Input texts: every .txt file exactly one directory level below ICNALE_texts.
inputfiles = glob.glob("ICNALE_texts/*/*.txt")
savedir = "ICNALE_analysis"
storeall = True  # when True, per-file tables are also accumulated in memory
storage = []  # per-span DataFrames, one per processed file

# Per-model output directory. exist_ok=True so that re-running the script
# does not fail with FileExistsError once the directory exists.
os.makedirs(os.path.join(savedir, modelname), exist_ok=True)


doc_level_storage = []  # document-level DataFrames, one per processed file

# Process every input text: run the pipeline, clean up the "sc" spans, and
# write one per-span CSV and one document-level CSV per file.
for file in inputfiles:

    # Base name of the input file; labels the rows and names the outputs.
    filename = os.path.split(file)[-1]

    # NOTE(review): no explicit encoding is given, so the platform default is
    # used -- confirm the corpus encoding.
    with open(file, "r") as f:
        text = f.read()

    text = preprocess(text)
    doc = nlp(text)
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans['sc'])

    data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
    seq = list(doc.spans["sc"])
    # NOTE(review): the trigram result is computed but never written anywhere.
    span_ngrams = ngrammar(seq=seq, n=3)

    ### Make it a dataset
    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})  # convert col type
    df = df.sort_values(by=['start'])  # and sort by start

    doclen = len(doc)

    # Scalars are broadcast to every row, so no index alignment is involved
    # even though the rows were just re-sorted.
    df.insert(0, "filename", filename, True)
    df.insert(1, "nwords", doclen, True)
    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

    # Engagement ngrams
    # NOTE(review): the bigram table is built but never saved.
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)
    # DataFrame.insert modifies the frame in place and returns None, so the
    # result must not be assigned back to bidf.
    bidf.insert(0, "filename", filename, True)

    ## Document level
    # Label frequencies in the fixed CATEGORIES order (absent labels -> 0).
    counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))

    div_data = pd.DataFrame.from_dict(div, orient='index')

    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, True)
    # Insert the word count as a scalar: a per-span Series would be aligned
    # on index labels instead of broadcast, leaving NaN whenever the labels
    # do not match (e.g. a document with no spans).
    doc_data.insert(1, "nwords", doclen, True)
    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)


# Combine the per-file span tables into a single CSV.
# pd.concat raises ValueError on an empty list (no input file matched, or
# storeall is False), so only write the combined table when there is data.
if storage:
    alldf = pd.concat(storage)
    alldf.to_csv(f"{savedir}/0_{modelname}_20230726.csv")
else:
    print("No per-file tables were collected; combined CSV not written.")


# alldoc = pd.concat(doc_level_storage)
# alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")