"""Gradio app that rates a YouTube video as effective/ineffective from its comments.

The comments of the video are downloaded, cleaned, merged into one document
and classified by three pre-trained models whose votes are weighted by the
accuracy figures hard-coded in ``vote``.
"""

import functools
import pickle
import re
import string

import gradio as gr
import pandas as pd
from pytube import extract
from pytube.exceptions import RegexMatchError
import nltk
import nltk.sentiment.util
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from youtube_comment_downloader import *

nltk.download('punkt')
nltk.download('wordnet')

# Stopwords dropped by clean_text. Kept as a frozenset so the membership test
# is O(1) and the collection is built once instead of on every call.
_STOPWORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "you're", "you've", "you'll", "you'd", "your", "yours", "yourself",
    "yourselves", "he", "him", "his", "himself", "she", "she's", "her",
    "hers", "herself", "it", "it's", "its", "itself", "they", "them",
    "their", "theirs", "themselves", "what", "which", "who", "whom", "this",
    "that", "that'll", "these", "those", "am", "is", "are", "was", "were",
    "be", "been", "being", "have", "has", "had", "having", "do", "does",
    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because",
    "as", "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how", "all", "any", "both", "each", "few",
    "more", "most", "other", "some", "such", "no", "nor", "not", "only",
    "own", "same", "so", "than", "too", "very", "s", "t", "can", "will",
    "just", "don", "don't", "should", "should've", "now", "d", "ll", "m",
    "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't",
    "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't",
    "haven", "haven't", "isn", "isn't", "ma", "mightn", "mightn't", "mustn",
    "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't",
    "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn",
    "wouldn't",
])

# Patterns whose matches are deleted from a comment, applied in this order.
# Raw strings avoid the invalid-escape-sequence warnings the previous
# non-raw literals ('\[', '\S', '\w', ...) triggered.
_CLEANUP_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'@',
        r'\[.*?\]',
        r'https?://\S+|www\.\S+',
        r'<.*?>+',
        '[%s]' % re.escape(string.punctuation),
        r'\n',
        r'\w*\d\w*',
        r'[^a-zA-Z ]+',
    )
)

_NEG_SUFFIX = "_NEG"

_YOUTUBE_URL_RE = re.compile(
    r"^(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/|youtube\.com/v/|youtube\.com/user/[^/]+/u/)?([^/&?=\s]{11})$"
)


class NoCommentsError(ValueError):
    """Raised when no comments could be downloaded for a video."""


# get YouTube ID
def getID(url):
    """Return the video ID that pytube extracts from ``url``.

    Raises whatever ``pytube.extract.video_id`` raises when it cannot find
    an ID in ``url``.
    """
    print("Getting YouTube ID...")
    return extract.video_id(url)


# function to clean comments
def clean_text(text):
    """Turn one raw comment into a list of cleaned tokens.

    Steps, in order: lower-case, delete everything matched by
    ``_CLEANUP_PATTERNS``, tokenize, lemmatize (default POS, then as verb),
    mark negation scopes with a ``_NEG`` suffix, and finally drop stopwords
    and tokens that are not alphanumeric once the suffix is ignored.

    NOTE(review): the saved models were presumably trained on text produced
    by exactly this pipeline, so the steps must not be reordered - confirm
    against the training code before changing anything here.
    """
    lemmatizer = WordNetLemmatizer()

    # remove symbols and Emojis
    text = text.lower()
    for pattern in _CLEANUP_PATTERNS:
        text = pattern.sub('', text)

    # tokenize the data
    tokens = nltk.word_tokenize(text)

    # lemmatize
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [lemmatizer.lemmatize(t, 'v') for t in tokens]

    # mark Negation
    tokens_neg_marked = nltk.sentiment.util.mark_negation(tokens)

    # remove stopwords (compared without the negation suffix)
    kept = []
    for token in tokens_neg_marked:
        base = token.replace(_NEG_SUFFIX, "")
        if base.isalnum() and base not in _STOPWORDS:
            kept.append(token)
    return kept


def _load_pickle(path):
    """Unpickle the object stored at ``path``, always closing the file.

    Only use this on files shipped with the project: unpickling runs
    arbitrary code from the file.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


def getSentenceTrain():
    """Load and return the pickled training sentences."""
    # open sentences_train file
    return _load_pickle('Deep learning/pickles/sentences_train.pickle')


SGD_train = _load_pickle('Shallow machine learning/pickles/SGD_74.pickle')
logreg_train = _load_pickle('Shallow machine learning/pickles/logreg_79.pickle')

# get saved CNN model
model = keras.models.load_model("Deep learning/CNN_82")


def vote(test_point, _test):
    """Combine the three models into one weighted verdict.

    Args:
        test_point: input passed to ``predict`` of the SGD and logistic
            regression models; a vote counts as positive when the
            prediction equals ``'pos'``.
        _test: input passed to ``predict`` of the CNN; a vote counts as
            positive when ``pred[0][0] > 0.5``.

    Returns:
        ``(result, confidence)`` where ``result`` is ``'effective'`` or
        ``'ineffective'`` and ``confidence`` is the share of the total
        weight that voted for ``result``.
    """
    print("Voting on video effectivess...\n")
    algorithms = [
        {'name': 'SGD', 'accuracy': 0.74*100, 'trained': SGD_train},
        {'name': 'Logistic Regression', 'accuracy': 0.79*100, 'trained': logreg_train},
        {'name': 'CNN', 'accuracy': 0.82*100, 'trained': model},
    ]

    pos_weighting = []
    algos_score = 0
    for algo in algorithms:
        weight = algo['accuracy']
        algos_score += weight

        if algo['name'] == "CNN":
            pred = algo['trained'].predict(_test)
            is_positive = bool(pred[0][0] > 0.5)
        else:
            pred = algo['trained'].predict(test_point)
            is_positive = bool(pred == 'pos')

        if is_positive:
            pos_weighting.append(weight)
        verdict = "effective" if is_positive else "ineffective"
        print(algo['name'] + " voted for: " + verdict)

    pos_result = sum(pos_weighting)/algos_score
    if pos_result < 0.5:
        return 'ineffective', 1 - pos_result
    return 'effective', pos_result


@functools.lru_cache(maxsize=1)
def _get_tokenizer():
    """Return the tokenizer fitted on the training sentences.

    Cached because the fit only depends on the pickled training sentences;
    previously the pickle was re-read and the tokenizer re-fitted on every
    request.
    """
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(getSentenceTrain())
    return tokenizer


def quantizeEffectiveness(url):
    """Download the comments of the video at ``url`` and vote on them.

    Returns:
        The ``(result, confidence)`` tuple produced by ``vote``.

    Raises:
        NoCommentsError: if the downloader yields no comments (previously
            this surfaced as a bare ``KeyError`` on the missing ``'text'``
            column).
    """
    # 1. Get YouTube ID
    print("Getting YouTube ID...")
    videoId = getID(url)

    # 2. Download comments
    print("Downloading comments...")
    downloader = YoutubeCommentDownloader()
    comments_downloaded = downloader.get_comments_from_url(
        f'https://www.youtube.com/watch?v={videoId}'
    )
    comments = list(comments_downloaded)
    if not comments:
        raise NoCommentsError("No comments could be downloaded for this video")
    comments_df = pd.DataFrame(comments)

    # 3. Clean comments
    print("Cleaning Comments...")
    comments_df['text'] = comments_df['text'].apply(clean_text)

    # get all words of video into one list
    all_words = [item for sublist in comments_df['text'].tolist() for item in sublist]

    # 4. Create test dataframe
    test = pd.DataFrame([[videoId]], columns=['VideoId'])

    # 5. Get documents (pre-processed comments)
    test['cleaned'] = [all_words]
    test['cleaned_string'] = [' '.join(map(str, l)) for l in test['cleaned']]

    # 6. Get ML test point
    test_point = test['cleaned_string']
    test_sentence = test['cleaned_string'].values

    # 7./8. Tokenizer fitted on the trained sentences
    print("Tokenizing the data...")
    tokenizer = _get_tokenizer()

    # 9. Get DL test point
    _test = pad_sequences(
        tokenizer.texts_to_sequences(test_sentence), padding='post', maxlen=100
    )

    # 10. Vote on video effectiveness
    result, confidence = vote(test_point, _test)

    return result, confidence


def is_valid_youtube_url(text):
    """Return True if ``text`` fully matches ``_YOUTUBE_URL_RE``.

    The pattern makes every prefix optional and ends on exactly 11
    characters, so URLs with extra query parameters are rejected while some
    strings pytube cannot parse are accepted; ``greet`` covers the latter.
    """
    return bool(_YOUTUBE_URL_RE.match(text))


def greet(url):
    """Gradio handler: return a one-line verdict, or an error message, for ``url``."""
    if not is_valid_youtube_url(url):
        return "Please input a valid YouTube URL"

    # The validator is looser than pytube's own ID extraction, so a string
    # can pass it and still have no extractable ID.
    try:
        video_id = getID(url)
    except RegexMatchError:
        return "Please input a valid YouTube URL"

    try:
        result, confidence = quantizeEffectiveness(url)
    except NoCommentsError as err:
        return str(err)

    return f"The video (ID: {video_id}) is {result} with a confidence of {round(confidence*100,2)}%"


iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()