|
from transformers import PretrainedConfig |
|
from nltk.corpus import stopwords |
|
from typing import List |
|
import nltk |
|
# Download the NLTK 'stopwords' and 'punkt' resources at import time.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)
|
|
|
class GZIPEmbeddingConfig(PretrainedConfig):
    """Configuration holder for the ``gzipembed`` model type.

    The constructor stores its arguments as attributes of the same name
    and forwards any remaining keyword arguments to ``PretrainedConfig``.

    Args:
        normalize: Stored as ``self.normalize``.
        normalized_corpus: Stored as ``self.normalized_corpus``.
        reduction: Stored as ``self.reduction``.
        reduced_dimension: Stored as ``self.reduced_dimension``.
        remove_stop_words: Stored as ``self.remove_stop_words``.
        stop_words: Stop-word list stored as ``self.stop_words``. When
            ``None`` (the default), the NLTK English stop-word list is
            loaded for this instance.
        corpus: Stored as ``self.corpus``. When ``None`` (the default),
            a fresh empty list is created for this instance.
        **kwargs: Passed through unchanged to
            ``PretrainedConfig.__init__``.
    """

    model_type = "gzipembed"

    def __init__(
        self,
        normalize: bool = True,
        normalized_corpus: bool = True,
        reduction: bool = False,
        reduced_dimension: int = 0,
        remove_stop_words: bool = True,
        stop_words=None,
        corpus=None,
        **kwargs,
    ):
        # Resolve the list defaults per instance: a list literal (or a
        # list computed once in the signature) would be shared by every
        # config object, so mutating one would silently change the others.
        if corpus is None:
            corpus = []
        if stop_words is None:
            stop_words = stopwords.words('english')

        # Earlier revisions of this class stored ``reduced_dimension`` as
        # a one-element tuple because of a stray trailing comma. Configs
        # saved by that version may therefore hand the value back wrapped
        # in a one-element list/tuple; unwrap it so the attribute is
        # always the plain value.
        if isinstance(reduced_dimension, (list, tuple)) and len(reduced_dimension) == 1:
            reduced_dimension = reduced_dimension[0]

        self.corpus = corpus
        self.normalize = normalize
        self.normalized_corpus = normalized_corpus
        self.reduction = reduction
        self.reduced_dimension = reduced_dimension
        self.remove_stop_words = remove_stop_words
        self.stop_words = stop_words

        super().__init__(**kwargs)
|
|