Spaces:

somosnlp-hackathon-2022
/

Sexismdetection

Runtime error

App Files Files Community

robertou2 committed on Mar 30, 2022

Commit

45be029

1 Parent(s): 3175701

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -6

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import tweepy as tw
 import streamlit as st
 import pandas as pd
 import torch
@@ -27,21 +27,60 @@ auth = tw.OAuthHandler(consumer_key, consumer_secret)
 auth.set_access_token(access_token, access_token_secret)
 api = tw.API(auth, wait_on_rate_limit=True)
 st.title('Analisis de comentarios sexistas en Twitter con Tweepy and HuggingFace Transformers')
 st.markdown('Esta app utiliza tweepy para descargar tweets de twitter en base a la información de entrada y procesa los tweets usando transformers de HuggingFace para detectar comentarios sexistas. El resultado y los tweets correspondientes se almacenan en un dataframe para mostrarlo que es lo que se ve como resultado')
 def run():
-    with st.form(key='Introduzca nombre'):
-        search_words = st.text_input('Introduzca el termino para analizar')
         number_of_tweets = st.number_input('Introduzca número de twweets a analizar. Máximo 50', 0,50,10)
-        submit_button = st.form_submit_button(label='Submit')
         if submit_button:
-            tweets =tw.Cursor(api.search_tweets,q=search_words).items(number_of_tweets)
             tweet_list = [i.text for i in tweets]
             text= pd.DataFrame(tweet_list)
             text1=text[0].values
             indices1=tokenizer.batch_encode_plus(text1.tolist(),
                                      max_length=128,

+import tweepy as tw
 import streamlit as st
 import pandas as pd
 import torch
 # Attach the user-level access token pair to the OAuth handler `auth`.
 auth.set_access_token(access_token, access_token_secret)
 # Build the Tweepy API client that run() uses for search_tweets / user_timeline.
 # NOTE(review): wait_on_rate_limit=True presumably makes Tweepy wait for the
 # rate-limit window to reset instead of raising -- confirm against the Tweepy docs.
 api = tw.API(auth, wait_on_rate_limit=True)
def preprocess(text):
    """Normalize the text of a tweet before it is tokenized.

    Steps, in order: lowercase; drop hyperlinks; decode the HTML entities
    ``&amp;``, ``&lt;`` and ``&gt;``; drop ``@mentions``; drop non-ASCII
    characters; strip punctuation other than ``.``, ``!`` and ``?``;
    collapse repeated ``.``/``!``/``?``; remove quotes and parentheses;
    collapse whitespace.

    Args:
        text: Raw tweet text (``str``).

    Returns:
        The cleaned, lowercased, single-spaced string (may be empty).
    """
    # Local import: the file's import block is only partially visible, so the
    # function brings in its own dependency instead of relying on a global.
    import re

    text = text.lower()

    # Remove hyperlinks. Only the URL token is removed; the previous pattern
    # ('https?://.*') swallowed everything up to the end of the line.
    text = re.sub(r'https?://\S+', '', text)

    # Decode HTML entities. '&amp;?' is a regex (optional ';'), so it must go
    # through re.sub -- str.replace treated it as a literal and never matched,
    # leaving a stray "amp" behind after punctuation stripping.
    text = re.sub(r'&amp;?', ' and ', text)
    text = text.replace('&lt;', '<').replace('&gt;', '>')

    # Remove @mentions (the '#' of hashtags is stripped with the punctuation).
    text = re.sub(r'@\w+', '', text)

    # Drop every non-ASCII character.
    # NOTE(review): this also removes Spanish accented letters and 'ñ'
    # ("niña" -> "nia"); kept as-is because the downstream model was
    # presumably trained on text cleaned the same way -- confirm.
    text = text.encode("ascii", errors="ignore").decode()

    # Strip punctuation except . ! ?  The hyphen is escaped on purpose: the
    # unescaped ',-/' formed a character range that also deleted '.'.
    text = re.sub(r'[:"#$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)

    # Collapse runs of sentence punctuation to a single character.
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\.+', '.', text)

    # Remove apostrophes and parentheses.
    text = re.sub(r"['()]", '', text)

    # Collapse all whitespace runs (including newlines) to single spaces.
    return " ".join(text.split())
 # Page header: render the app title, then a Spanish-language description saying
 # the app downloads tweets with Tweepy, runs them through a HuggingFace
 # transformer to detect sexist comments, and shows the results in a dataframe.
 st.title('Analisis de comentarios sexistas en Twitter con Tweepy and HuggingFace Transformers')
 st.markdown('Esta app utiliza tweepy para descargar tweets de twitter en base a la información de entrada y procesa los tweets usando transformers de HuggingFace para detectar comentarios sexistas. El resultado y los tweets correspondientes se almacenan en un dataframe para mostrarlo que es lo que se ve como resultado')
 def run():
+    with st.form(key='Introduzca Texto'):
+        search_words = st.text_input('Introduzca el termino o usuario para analizar y pulse el check ')
         number_of_tweets = st.number_input('Introduzca número de twweets a analizar. Máximo 50', 0,50,10)
+        termino=st.checkbox('Término')
+        usuario=st.checkbox('Usuario')
+        submit_button = st.form_submit_button(label='Analizar')
         if submit_button:
+            date_since = "2020-09-14"
+            if (termino):
+                new_search = search_words + " -filter:retweets"
+                tweets =tw.Cursor(api.search_tweets,q=new_search,lang="es",since=date_since).items(number_of_tweets)
+            elif (usuario):
+                tweets = api.user_timeline(screen_name = search_words,count=number_of_tweets)
+            #new_search = search_words + " -filter:retweets"
+            #tweets = tweepy.Cursor(api.search,q=new_search,lang="es",since=date_since).items(number_of_tweets)
+            #tweets =tw.Cursor(api.search_tweets,q=search_words).items(number_of_tweets)
+            #tweets =tw.Cursor(api.search_tweets,q=new_search,lang="es",since=date_since).items(number_of_tweets)
             tweet_list = [i.text for i in tweets]
+            #tweet_list = [strip_undesired_chars(i.text) for i in tweets]
             text= pd.DataFrame(tweet_list)
+            text[0] = text[0].apply(preprocess)
             text1=text[0].values
             indices1=tokenizer.batch_encode_plus(text1.tolist(),
                                      max_length=128,