Upload tokenizer

Browse files

Files changed (5) hide show

README.md +199 -0
special_tokens_map.json +6 -0
tokenizer.json +369 -0
tokenizer_config.json +44 -0
vocab.json +1 -0

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app. -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+[More Information Needed]
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token": "UTT_BOUNDARY",
+  "eos_token": "UTT_BOUNDARY",
+  "pad_token": "PAD",
+  "unk_token": "UNK"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,369 @@

+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "UNK",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "PAD",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "W",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "UTT_BOUNDARY",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": {
+    "type": "Sequence",
+    "normalizers": [
+      {
+        "type": "NFD"
+      },
+      {
+        "type": "Lowercase"
+      },
+      {
+        "type": "Strip",
+        "strip_left": true,
+        "strip_right": true
+      },
+      {
+        "type": "StripAccents"
+      },
+      {
+        "type": "Replace",
+        "pattern": {
+          "String": " "
+        },
+        "content": "W"
+      }
+    ]
+  },
+  "pre_tokenizer": {
+    "type": "Split",
+    "pattern": {
+      "String": ""
+    },
+    "behavior": "Isolated",
+    "invert": false
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "UTT_BOUNDARY",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "UTT_BOUNDARY",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "UTT_BOUNDARY",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "UTT_BOUNDARY": {
+        "id": "UTT_BOUNDARY",
+        "ids": [
+          3
+        ],
+        "tokens": [
+          "UTT_BOUNDARY"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "UNK": 0,
+      "PAD": 1,
+      "W": 2,
+      "UTT_BOUNDARY": 3,
+      "d": 4,
+      "o": 5,
+      "y": 6,
+      "u": 7,
+      "w": 8,
+      "a": 9,
+      "n": 10,
+      "t": 11,
+      "l": 12,
+      "k": 13,
+      "h": 14,
+      "i": 15,
+      "s": 16,
+      "?": 17,
+      "e": 18,
+      ".": 19,
+      "r": 20,
+      "'": 21,
+      "f": 22,
+      "c": 23,
+      "g": 24,
+      "p": 25,
+      "b": 26,
+      "m": 27,
+      "v": 28,
+      "j": 29,
+      "!": 30,
+      "x": 31,
+      "q": 32,
+      "z": 33,
+      "-": 34,
+      "&": 35,
+      ",": 36,
+      "/": 37,
+      "1": 38,
+      "9": 39,
+      "5": 40,
+      "0": 41,
+      ";": 42,
+      "‘": 43,
+      "’": 44,
+      "—": 45,
+      ":": 46,
+      "+": 47,
+      "8": 48,
+      "3": 49,
+      "7": 50,
+      "4": 51,
+      "6": 52,
+      "2": 53,
+      "=": 54,
+      ")": 55,
+      "(": 56,
+      "_": 57,
+      "*": 58,
+      "£": 59,
+      "–": 60,
+      "#": 61,
+      "`": 62,
+      "\"": 63,
+      "æ": 64,
+      "]": 65,
+      "|": 66,
+      "$": 67,
+      "“": 68,
+      "”": 69,
+      "[": 70,
+      "œ": 71,
+      "{": 72,
+      "}": 73,
+      "…": 74,
+      "°": 75,
+      "§": 76,
+      ">": 77,
+      "·": 78,
+      "¢": 79,
+      "%": 80,
+      "^": 81,
+      "½": 82,
+      "¶": 83,
+      "×": 84,
+      "¼": 85,
+      "¾": 86,
+      "φ": 87,
+      "<": 88,
+      "´": 89,
+      "¯": 90,
+      "¦": 91,
+      "": 92,
+      "«": 93,
+      "¬": 94,
+      "©": 95,
+      "": 96,
+      "●": 97,
+      "¹": 98,
+      "⁄": 99,
+      "²": 100,
+      "₂": 101,
+      "│": 102,
+      "δ": 103,
+      "α": 104,
+      "ο": 105,
+      "υ": 106,
+      "χ": 107,
+      "ι": 108,
+      "η": 109,
+      "μ": 110,
+      "ε": 111,
+      "γ": 112,
+      "λ": 113,
+      "β": 114,
+      "κ": 115,
+      "ω": 116,
+      "ν": 117,
+      "ρ": 118,
+      "σ": 119,
+      "ς": 120,
+      "τ": 121,
+      "θ": 122,
+      "π": 123,
+      "′": 124,
+      "³": 125,
+      "⸺": 126,
+      "―": 127,
+      "±": 128,
+      "~": 129,
+      "ß": 130,
+      "µ": 131,
+      "„": 132,
+      "@": 133,
+      "\\": 134,
+      "♪": 135,
+      "¿": 136,
+      "¡": 137,
+      "": 138,
+      "─": 139,
+      "": 140,
+      "ð": 141,
+      "": 142,
+      "º": 143,
+      "¸": 144,
+      "þ": 145,
+      "и": 146,
+      "♫": 147,
+      "¤": 148,
+      "¨": 149,
+      "ø": 150,
+      "": 151,
+      "ª": 152,
+      "đ": 153,
+      "": 154,
+      "": 155,
+      "€": 156,
+      "ﬂ": 157,
+      "♬": 158,
+      "": 159,
+      "，": 160,
+      "": 161,
+      "™": 162,
+      "®": 163,
+      "ı": 164,
+      "с": 165,
+      "в": 166,
+      "‎": 167,
+      "ц": 168,
+      "ь": 169,
+      "д": 170,
+      "н": 171,
+      "е": 172,
+      "м": 173,
+      "о": 174,
+      "т": 175,
+      "г": 176,
+      "а": 177,
+      "р": 178,
+      "ч": 179,
+      "п": 180,
+      "я": 181,
+      "б": 182,
+      "•": 183,
+      "ł": 184,
+      "ร": 185,
+      "ก": 186,
+      "к": 187,
+      "у": 188,
+      "": 189,
+      "": 190,
+      "": 191,
+      "": 192,
+      "": 193,
+      "": 194,
+      "": 195,
+      "大": 196,
+      "": 197,
+      "ا": 198,
+      "л": 199,
+      "з": 200,
+      "‑": 201,
+      "ʻ": 202,
+      "ت": 203,
+      "ו": 204,
+      "י": 205,
+      "ر": 206,
+      "ي": 207,
+      "ل": 208,
+      "ه": 209,
+      "ع": 210,
+      "ن": 211,
+      "ə": 212,
+      "−": 213,
+      "→": 214,
+      "ы": 215,
+      "ː": 216,
+      "و": 217,
+      "م": 218,
+      "س": 219,
+      "ح": 220,
+      "د": 221,
+      "ب": 222,
+      "ی": 223,
+      "»": 224,
+      "น": 225,
+      "า": 226,
+      "ง": 227,
+      "ม": 228,
+      "ʼ": 229,
+      "ˈ": 230
+    },
+    "unk_token": "UNK"
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "UNK",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "PAD",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "W",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "UTT_BOUNDARY",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "UTT_BOUNDARY",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "UTT_BOUNDARY",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "PAD",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "UNK"
+}

vocab.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"UNK":0,"PAD":1,"W":2,"UTT_BOUNDARY":3,"d":4,"o":5,"y":6,"u":7,"w":8,"a":9,"n":10,"t":11,"l":12,"k":13,"h":14,"i":15,"s":16,"?":17,"e":18,".":19,"r":20,"'":21,"f":22,"c":23,"g":24,"p":25,"b":26,"m":27,"v":28,"j":29,"!":30,"x":31,"q":32,"z":33,"-":34,"&":35,",":36,"/":37,"1":38,"9":39,"5":40,"0":41,";":42,"‘":43,"’":44,"—":45,":":46,"+":47,"8":48,"3":49,"7":50,"4":51,"6":52,"2":53,"=":54,")":55,"(":56,"_":57,"*":58,"£":59,"–":60,"#":61,"`":62,"\"":63,"æ":64,"]":65,"|":66,"$":67,"“":68,"”":69,"[":70,"œ":71,"{":72,"}":73,"…":74,"°":75,"§":76,">":77,"·":78,"¢":79,"%":80,"^":81,"½":82,"¶":83,"×":84,"¼":85,"¾":86,"φ":87,"<":88,"´":89,"¯":90,"¦":91,"":92,"«":93,"¬":94,"©":95,"":96,"●":97,"¹":98,"⁄":99,"²":100,"₂":101,"│":102,"δ":103,"α":104,"ο":105,"υ":106,"χ":107,"ι":108,"η":109,"μ":110,"ε":111,"γ":112,"λ":113,"β":114,"κ":115,"ω":116,"ν":117,"ρ":118,"σ":119,"ς":120,"τ":121,"θ":122,"π":123,"′":124,"³":125,"⸺":126,"―":127,"±":128,"~":129,"ß":130,"µ":131,"„":132,"@":133,"\\":134,"♪":135,"¿":136,"¡":137,"":138,"─":139,"":140,"ð":141,"":142,"º":143,"¸":144,"þ":145,"и":146,"♫":147,"¤":148,"¨":149,"ø":150,"":151,"ª":152,"đ":153,"":154,"":155,"€":156,"ﬂ":157,"♬":158,"":159,"，":160,"":161,"™":162,"®":163,"ı":164,"с":165,"в":166,"‎":167,"ц":168,"ь":169,"д":170,"н":171,"е":172,"м":173,"о":174,"т":175,"г":176,"а":177,"р":178,"ч":179,"п":180,"я":181,"б":182,"•":183,"ł":184,"ร":185,"ก":186,"к":187,"у":188,"":189,"":190,"":191,"":192,"":193,"":194,"":195,"大":196,"":197,"ا":198,"л":199,"з":200,"‑":201,"ʻ":202,"ت":203,"ו":204,"י":205,"ر":206,"ي":207,"ل":208,"ه":209,"ع":210,"ن":211,"ə":212,"−":213,"→":214,"ы":215,"ː":216,"و":217,"م":218,"س":219,"ح":220,"د":221,"ب":222,"ی":223,"»":224,"น":225,"า":226,"ง":227,"ม":228,"ʼ":229,"ˈ":230}