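"""Prediction utilities for wiki deletion discussions.

Wraps Hugging Face text-classification pipelines for outcome prediction on
deletion discussions (English Wikipedia, Wikidata entities/properties,
Wikinews, and Wikiquote, plus Spanish and Greek Wikipedia), and for
sentence-level stance detection, policy prediction, sentiment analysis, and
offensive-language detection. pysbd handles sentence segmentation.
"""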
from transformers import pipeline, AutoTokenizer
import pysbd
import torch

label_mapping_wikipedia_en = {
    'delete': [0, 'LABEL_0'],
    'keep': [1, 'LABEL_1'],
    'merge': [2, 'LABEL_2'],
    'no consensus': [3, 'LABEL_3'],
    'speedy keep': [4, 'LABEL_4'],
    'speedy delete': [5, 'LABEL_5'],
    'redirect': [6, 'LABEL_6'],
    'withdrawn': [7, 'LABEL_7']
}

label_mapping_es = {
    'Borrar': [0, 'LABEL_0'],
    'Mantener': [1, 'LABEL_1'],
    'Fusionar': [2, 'LABEL_2'],
    'Otros': [3, 'LABEL_3']
}

label_mapping_gr = {
    'Διαγραφή': [0, 'LABEL_0'],
    'Δεν υπάρχει συναίνεση': [1, 'LABEL_1'],
    'Διατήρηση': [2, 'LABEL_2'],
    'συγχώνευση': [3, 'LABEL_3']
}

label_mapping_wikidata_ent = {
    'delete': [0, 'LABEL_0'],
    'no_consensus': [1, 'LABEL_1'],
    'merge': [2, 'LABEL_2'],
    'keep': [3, 'LABEL_3'],
    'comment': [4, 'LABEL_4'],
    'redirect': [5, 'LABEL_5']
}

label_mapping_wikidata_prop = {
    'deleted': [0, 'LABEL_0'],
    'keep': [1, 'LABEL_1'],
    'no_consensus': [2, 'LABEL_2']
}

label_mapping_wikinews = {
    'delete': [0, 'LABEL_0'],
    'no_consensus': [1, 'LABEL_1'],
    'speedy delete': [2, 'LABEL_2'],
    'keep': [3, 'LABEL_3'],
    'redirect': [4, 'LABEL_4'],
    'comment': [5, 'LABEL_5'],
    'merge': [6, 'LABEL_6'],
    'withdrawn': [7, 'LABEL_7']
}

label_mapping_wikiquote = {
    'merge': [0, 'LABEL_0'],
    'keep': [1, 'LABEL_1'],
    'no_consensus': [2, 'LABEL_2'],
    'redirect': [3, 'LABEL_3'],
    'delete': [4, 'LABEL_4']
}
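# Illustrative sketch only (hypothetical helper, not used by the functions
# below, which inline the same lookup): each mapping above pairs a
# human-readable outcome with its class index and the pipeline's raw
# 'LABEL_<index>' name; this shows how a raw label translates back.
def _decode_label(raw_label, label_mapping):
    for key, (_, pipeline_label) in label_mapping.items():
        if pipeline_label == raw_label:
            return key
    return None

# e.g. _decode_label('LABEL_2', label_mapping_wikipedia_en) -> 'merge'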
# Best-performing fine-tuned models per English task and per language.
best_models_tasks = {
    'wikipedia': 'research-dump/roberta-large_deletion_multiclass_complete_final_v2',
    'wikidata_entity': 'research-dump/roberta-large_wikidata_ent_outcome_prediction_v1',
    'wikidata_property': 'research-dump/roberta-large_wikidata_prop_outcome_prediction_v1',
    'wikinews': 'research-dump/all-roberta-large-v1_wikinews_outcome_prediction_v1',
    'wikiquote': 'research-dump/roberta-large_wikiquote_outcome_prediction_v1'
}

best_models_langs = {
    'en': 'research-dump/roberta-large_deletion_multiclass_complete_final_v2',
    'es': 'research-dump/xlm-roberta-large_deletion_multiclass_es',
    'gr': 'research-dump/xlm-roberta-large_deletion_multiclass_gr'
}


#-----------------Outcome Prediction-----------------
def outcome(text, lang='en', platform='wikipedia', date='', years=None):
    # 'date' and 'years' are accepted for interface compatibility but are
    # currently unused.
    if lang == 'en':
        if platform not in best_models_tasks:
            raise ValueError(f"For lang='en', platform must be one of {list(best_models_tasks.keys())}")
        model_name = best_models_tasks[platform]
        if platform == 'wikipedia':
            label_mapping = label_mapping_wikipedia_en
        elif platform == 'wikidata_entity':
            label_mapping = label_mapping_wikidata_ent
        elif platform == 'wikidata_property':
            label_mapping = label_mapping_wikidata_prop
        elif platform == 'wikinews':
            label_mapping = label_mapping_wikinews
        elif platform == 'wikiquote':
            label_mapping = label_mapping_wikiquote
    elif lang in ['es', 'gr']:
        if platform != 'wikipedia':
            raise ValueError(f"For lang='{lang}', only platform='wikipedia' is supported.")
        model_name = best_models_langs[lang]
        label_mapping = label_mapping_es if lang == 'es' else label_mapping_gr
    else:
        raise ValueError("Invalid lang. Use 'en', 'es', or 'gr'.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = pipeline("text-classification", model=model_name, return_all_scores=True, device=device)

    # Truncate the input to the model's 512-token limit, then map each raw
    # 'LABEL_<i>' score back to its human-readable outcome.
    tokens = tokenizer(text, truncation=True, max_length=512)
    truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)

    results = model(truncated_text)
    res_list = []
    for result in results[0]:
        for key, value in label_mapping.items():
            if result['label'] == value[1]:
                res_list.append({'sentence': truncated_text, 'outcome': key, 'score': result['score']})
                break
    return res_list


def extract_response(text, model_name, label_mapping):
    # Shared scorer for the stance and policy models: returns a score for
    # every label in label_mapping (values here are plain class indices).
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    pipe = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None)

    # Truncate the input to the model's 512-token limit before classifying.
    tokens = tokenizer(text, truncation=True, max_length=512)
    truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True)

    results = pipe(truncated_text)
    final_scores = {key: 0.0 for key in label_mapping}
    for result in results[0]:
        for key, value in label_mapping.items():
            if result['label'] == f'LABEL_{value}':
                final_scores[key] = result['score']
                break
    return final_scores


#-----------------Stance Detection-----------------
def get_stance(text):
    label_mapping = {
        'delete': 0,
        'keep': 1,
        'merge': 2,
        'comment': 3
    }
    # Segment into sentences and classify the stance of each one.
    seg = pysbd.Segmenter(language="en", clean=False)
    text_list = seg.segment(text)
    model = 'research-dump/bert-large-uncased_wikistance_v1'
    res_list = []
    for t in text_list:
        res = extract_response(t, model, label_mapping)
        highest_key = max(res, key=res.get)
        highest_score = res[highest_key]
        res_list.append({'sentence': t, 'stance': highest_key, 'score': highest_score})
    return res_list
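# Example (hypothetical scores): get_stance("Delete. Fails WP:GNG.") segments
# the input with pysbd and returns one record per sentence, e.g.
# [{'sentence': 'Delete.', 'stance': 'delete', 'score': 0.97},
#  {'sentence': 'Fails WP:GNG.', 'stance': 'delete', 'score': 0.88}]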
#-----------------Policy Prediction-----------------
def get_policy(text):
    label_mapping = {
        'Wikipedia:Notability': 0,
        'Wikipedia:What Wikipedia is not': 1,
        'Wikipedia:Neutral point of view': 2,
        'Wikipedia:Verifiability': 3,
        'Wikipedia:Wikipedia is not a dictionary': 4,
        'Wikipedia:Wikipedia is not for things made up one day': 5,
        'Wikipedia:Criteria for speedy deletion': 6,
        'Wikipedia:Deletion policy': 7,
        'Wikipedia:No original research': 8,
        'Wikipedia:Biographies of living persons': 9,
        'Wikipedia:Arguments to avoid in deletion discussions': 10,
        'Wikipedia:Conflict of interest': 11,
        'Wikipedia:Articles for deletion': 12
    }
    # Segment into sentences and predict the most relevant policy for each.
    seg = pysbd.Segmenter(language="en", clean=False)
    text_list = seg.segment(text)
    model = 'research-dump/bert-large-uncased_wikistance_policy_v1'
    res_list = []
    for t in text_list:
        res = extract_response(t, model, label_mapping)
        highest_key = max(res, key=res.get)
        highest_score = res[highest_key]
        res_list.append({'sentence': t, 'policy': highest_key, 'score': highest_score})
    return res_list


#-----------------Sentiment Analysis-----------------
def extract_highest_score_label(res):
    # Flatten the pipeline's nested [[{label, score}, ...]] output and return
    # the best-scoring label with its score.
    flat_res = [item for sublist in res for item in sublist]
    highest_score_item = max(flat_res, key=lambda x: x['score'])
    highest_score_label = highest_score_item['label']
    highest_score_value = highest_score_item['score']
    return highest_score_label, highest_score_value


def get_sentiment(text):
    # Sentiment analysis.
    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None)

    # Sentence-tokenize the text using pysbd.
    seg = pysbd.Segmenter(language="en", clean=False)
    text_list = seg.segment(text)

    res = []
    for t in text_list:
        results = model(t)
        highest_label, highest_score = extract_highest_score_label(results)
        res.append({'sentence': t, 'sentiment': highest_label, 'score': highest_score})
    return res


#-----------------Toxicity Prediction-----------------
def get_offensive_label(text):
    # Offensive-language detection model.
    model_name = "cardiffnlp/twitter-roberta-base-offensive"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None)

    # Sentence-tokenize the text using pysbd.
    seg = pysbd.Segmenter(language="en", clean=False)
    text_list = seg.segment(text)

    res = []
    for t in text_list:
        results = model(t)
        highest_label, highest_score = extract_highest_score_label(results)
        res.append({'sentence': t, 'offensive_label': highest_label, 'score': highest_score})
    return res


def predict_text(text, model_name, lang='en', platform='wikipedia', date='', years=None):
    # Anchor function: dispatch to the task-specific predictor.
    if model_name == 'outcome':
        return outcome(text, lang=lang, platform=platform, date=date, years=years)
    elif model_name == 'stance':
        return get_stance(text)
    elif model_name == 'policy':
        return get_policy(text)
    elif model_name == 'sentiment':
        return get_sentiment(text)
    elif model_name == 'offensive':
        return get_offensive_label(text)
    else:
        return "Invalid model name"
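if __name__ == "__main__":
    # Minimal smoke test (illustrative): the research-dump models are fetched
    # from the Hugging Face Hub on first use, so this needs network access and
    # may take a while.
    sample = "Delete. The article cites no reliable sources and fails notability."
    print(predict_text(sample, 'outcome', lang='en', platform='wikipedia'))
    print(predict_text(sample, 'stance'))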