File size: 3,016 Bytes
68d26c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from deepmultilingualpunctuation import PunctuationModel
import re
import metrics
        
def remove_filler_words(transcript, filler_words=None):
    """Remove spoken filler words from a transcript while keeping its lines.

    The transcript is stripped of leading/trailing whitespace and then
    processed one line at a time: each line is split on whitespace, every
    token that equals a filler word (compared case-insensitively) is dropped,
    and the remaining tokens are joined back with single spaces.

    Working per line (instead of marking line ends with a ``#`` sentinel and
    splitting the whole text) means that:

    * a line consisting only of filler words becomes an empty line rather
      than leaving a stray ``#`` in the output, so the number of lines is
      preserved;
    * a literal ``#`` token in the transcript is kept as-is and is not
      mistaken for a line break.

    Only whole whitespace-separated tokens are matched, so a filler word with
    punctuation attached (e.g. ``"um,"``) is not removed.

    Args:
        transcript: Transcript text, lines separated by newlines.
        filler_words: Optional iterable of words to remove. When omitted, the
            built-in list ("um", "uh", "hmm", "ha", "er", "ah", "yeah") is
            used.

    Returns:
        The cleaned transcript, with one output line per input line.
    """
    if filler_words is None:
        filler_words = ("um", "uh", "hmm", "ha", "er", "ah", "yeah")
    # A set gives O(1) membership tests; lower-casing here lets callers pass
    # words in any case.
    fillers = {word.lower() for word in filler_words}

    cleaned_lines = []
    for line in transcript.strip().splitlines():
        kept_words = [word for word in line.split() if word.lower() not in fillers]
        cleaned_lines.append(" ".join(kept_words))
    return "\n".join(cleaned_lines)


def predict(brakes, transcript):
    """Restore punctuation and capitalisation in a transcript and measure it.

    Steps:
      1. Filler words are removed with ``remove_filler_words``.
      2. ``PunctuationModel.restore_punctuation`` punctuates the text.
      3. Line breaks are applied according to ``brakes``:
         * ``'textlines'``: the punctuated words are regrouped so that every
           output line holds exactly as many words as the corresponding line
           of the cleaned transcript (blank lines stay blank);
         * ``'sentences'`` (only checked when ``'textlines'`` is absent): a
           line break is inserted after every ``.``, ``?`` or ``!`` that is
           followed by whitespace;
         * otherwise the model output is used unchanged.
      4. A standalone ``i`` becomes ``I``, the first word character after
         ``.``, ``?``, ``!`` or ``;`` plus whitespace is upper-cased, and so
         is the first character of the text.
      5. Token, sentence, word and character counts are taken from the
         project's ``metrics`` module.

    Args:
        brakes: Container (or string) tested with ``in`` for the options
            ``'textlines'`` and ``'sentences'``.
        transcript: Raw transcript text, lines separated by newlines.

    Returns:
        On success, the 5-tuple
        ``(text, n_words, n_sents, n_chars, n_tokens)``.
        If ``'textlines'`` is requested and the punctuated text does not have
        the same number of words as the cleaned transcript, the 3-tuple
        ``(error_message, n_source_words, n_punctuated_words)`` is returned
        instead.
    """
    input_text = remove_filler_words(transcript)

    # NOTE(review): the model is constructed on every call; if construction
    # is expensive, consider loading it once and reusing it.
    model = PunctuationModel()
    output_text = model.restore_punctuation(input_text)

    # Default: no line-break option selected, keep the model output as-is.
    pcnt_file_cr = output_text

    if 'textlines' in brakes:
        # Goal: reproduce the line structure of the source. Each source line
        # is reduced to its word count and the punctuated words are dealt out
        # in that pattern. No sentinel character is injected, so blank lines
        # and literal '#' characters in the text are handled correctly.
        source_lines = [line.split() for line in input_text.strip().splitlines()]
        punctuated_words = output_text.split()
        n_source_words = sum(len(words) for words in source_lines)

        if n_source_words != len(punctuated_words):
            # NOTE(review): this error is *returned* with a different tuple
            # shape than the success path; kept unchanged for callers.
            return "AssertError: The length of the transcript and the punctuated file should be the same: ", n_source_words, len(punctuated_words)

        rebuilt_lines = []
        start = 0
        for words in source_lines:
            end = start + len(words)
            rebuilt_lines.append(' '.join(punctuated_words[start:end]))
            start = end

        # Joining with '\n' directly avoids the leading space that a
        # "join with ' ' then replace the marker" approach leaves on every
        # line after the first.
        pcnt_file_cr = '\n'.join(rebuilt_lines)

    elif 'sentences' in brakes:
        # Break after any sentence-ending mark, not only after a period.
        pcnt_file_cr = re.sub(r"(?<=[.?!])\s+", "\n", output_text)

    # Capitalisation pass.
    capitalised = re.sub(r"\bi\b", "I", pcnt_file_cr)
    # Require at least one whitespace character after the punctuation mark so
    # that tokens with inner punctuation ("example.com") are left untouched.
    capitalised = re.sub(
        r"(?<=[.?!;])\s+\w", lambda match: match.group().upper(), capitalised
    )
    pcnt_file_cr_cap = re.sub(
        r"^\w", lambda match: match.group().upper(), capitalised
    )

    metrics.load_nltk()
    n_tokens = metrics.num_tokens(pcnt_file_cr_cap)
    n_sents = metrics.num_sentences(pcnt_file_cr_cap)
    n_words = metrics.num_words(pcnt_file_cr_cap)
    n_chars = metrics.num_chars(pcnt_file_cr_cap)

    return pcnt_file_cr_cap, n_words, n_sents, n_chars, n_tokens