nizar-sayad commited on
Commit
ef14c81
·
0 Parent(s):

Duplicate from cardiffnlp/twitter-roberta-base-sentiment-latest

Browse files
Files changed (8) hide show
  1. .gitattributes +27 -0
  2. README.md +87 -0
  3. config.json +37 -0
  4. merges.txt +0 -0
  5. pytorch_model.bin +3 -0
  6. special_tokens_map.json +1 -0
  7. tf_model.h5 +3 -0
  8. vocab.json +0 -0
.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ widget:
4
+ - text: Covid cases are increasing fast!
5
+ datasets:
6
+ - tweet_eval
7
+ duplicated_from: cardiffnlp/twitter-roberta-base-sentiment-latest
8
+ ---
9
+
10
+
11
+ # Twitter-roBERTa-base for Sentiment Analysis - UPDATED (2022)
12
+
13
+ This is a RoBERTa-base model trained on ~124M tweets from January 2018 to December 2021, and finetuned for sentiment analysis with the TweetEval benchmark.
14
+ The original Twitter-based RoBERTa model can be found [here](https://huggingface.co/cardiffnlp/twitter-roberta-base-2021-124m) and the original reference paper is [TweetEval](https://github.com/cardiffnlp/tweeteval). This model is suitable for English.
15
+
16
+ - Reference Paper: [TimeLMs paper](https://arxiv.org/abs/2202.03829).
17
+ - Git Repo: [TimeLMs official repository](https://github.com/cardiffnlp/timelms).
18
+
19
+ <b>Labels</b>:
20
+ 0 -> Negative;
21
+ 1 -> Neutral;
22
+ 2 -> Positive
23
+
24
+ This sentiment analysis model has been integrated into [TweetNLP](https://github.com/cardiffnlp/tweetnlp). You can access the demo [here](https://tweetnlp.org).
25
+
26
+ ## Example Pipeline
27
+ ```python
28
+ from transformers import pipeline
29
+ sentiment_task = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest")
30
+ sentiment_task("Covid cases are increasing fast!")
31
+ ```
32
+ ```
33
+ [{'label': 'Negative', 'score': 0.7236}]
34
+ ```
35
+
36
+ ## Full classification example
37
+
38
+ ```python
39
+ from transformers import AutoModelForSequenceClassification
40
+ from transformers import TFAutoModelForSequenceClassification
41
+ from transformers import AutoTokenizer, AutoConfig
42
+ import numpy as np
43
+ from scipy.special import softmax
44
+ # Preprocess text (username and link placeholders)
45
+ def preprocess(text):
46
+ new_text = []
47
+ for t in text.split(" "):
48
+ t = '@user' if t.startswith('@') and len(t) > 1 else t
49
+ t = 'http' if t.startswith('http') else t
50
+ new_text.append(t)
51
+ return " ".join(new_text)
52
+ MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
53
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
54
+ config = AutoConfig.from_pretrained(MODEL)
55
+ # PT
56
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
57
+ #model.save_pretrained(MODEL)
58
+ text = "Covid cases are increasing fast!"
59
+ text = preprocess(text)
60
+ encoded_input = tokenizer(text, return_tensors='pt')
61
+ output = model(**encoded_input)
62
+ scores = output[0][0].detach().numpy()
63
+ scores = softmax(scores)
64
+ # # TF
65
+ # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
66
+ # model.save_pretrained(MODEL)
67
+ # text = "Covid cases are increasing fast!"
68
+ # encoded_input = tokenizer(text, return_tensors='tf')
69
+ # output = model(encoded_input)
70
+ # scores = output[0][0].numpy()
71
+ # scores = softmax(scores)
72
+ # Print labels and scores
73
+ ranking = np.argsort(scores)
74
+ ranking = ranking[::-1]
75
+ for i in range(scores.shape[0]):
76
+ l = config.id2label[ranking[i]]
77
+ s = scores[ranking[i]]
78
+ print(f"{i+1}) {l} {np.round(float(s), 4)}")
79
+ ```
80
+
81
+ Output:
82
+
83
+ ```
84
+ 1) Negative 0.7236
85
+ 2) Neutral 0.2287
86
+ 3) Positive 0.0477
87
+ ```
config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/jupyter/misc/tweeteval/TweetEval_models/sentiment/sentiment_latest_2021/",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "id2label": {
15
+ "0": "negative",
16
+ "1": "neutral",
17
+ "2": "positive"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 3072,
21
+ "label2id": {
22
+ "negative": 0,
23
+ "neutral": 1,
24
+ "positive": 2
25
+ },
26
+ "layer_norm_eps": 1e-05,
27
+ "max_position_embeddings": 514,
28
+ "model_type": "roberta",
29
+ "num_attention_heads": 12,
30
+ "num_hidden_layers": 12,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.13.0.dev0",
35
+ "type_vocab_size": 1,
36
+ "vocab_size": 50265
37
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d24a3e32a88ed1c4e5b789fc6644e2e767500554e954b27dccf52a8e762cbae
3
+ size 501045531
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:682358ffb3869b08a144d5e59325534335729720fe64d5f2b3a543f8e5d14a9e
3
+ size 498845224
vocab.json ADDED
The diff for this file is too large to render. See raw diff