import torch
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModelForTokenClassification

def nltk_speach_tag(sentence):
    # Fetch the NLTK data this function depends on (no-op if already present)
    nltk.download("punkt", quiet=True)
    nltk.download("averaged_perceptron_tagger", quiet=True)
    nltk.download("stopwords", quiet=True)

    # Tokenize the sentence
    tokens = word_tokenize(sentence)

    # Filter out stopwords and punctuation
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [
        word for word in tokens if word.lower() not in stop_words and word.isalnum()
    ]

    # Part-of-speech tagging (Penn Treebank tags)
    tagged_tokens = pos_tag(filtered_tokens)

    # Keep nouns, proper nouns, adjectives, and foreign words
    salient_tokens = [
        token
        for token, pos in tagged_tokens
        if pos in ["NN", "NNP", "NNS", "NNPS", "JJ", "FW"]
    ]
    # Deduplicate while preserving the original word order
    salient_tokens = list(dict.fromkeys(salient_tokens))
    pos_by_token = dict(tagged_tokens)

    # Character positions of commas and periods in the original sentence
    comma_period_indices = [i for i, char in enumerate(sentence) if char in [",", "."]]
    salient_token_indices = [sentence.index(token) for token in salient_tokens]

    # Re-join the kept tokens, restoring a period wherever the original
    # sentence had a comma or period between two of them
    out = ""
    for i, index in enumerate(salient_token_indices):
        out += salient_tokens[i]
        distance_to_next = (
            salient_token_indices[i + 1] - index
            if i + 1 < len(salient_token_indices)
            else None
        )
        punctuated = False
        if distance_to_next is None:
            punctuated = True
        else:
            for j in range(index, index + distance_to_next):
                if j in comma_period_indices:
                    punctuated = True
                    break
        if not punctuated:
            # Join an adjective directly to the noun that follows it with a
            # space instead of a comma
            if (
                pos_by_token.get(salient_tokens[i]) == "JJ"
                and i + 1 < len(salient_tokens)
                and pos_by_token.get(salient_tokens[i + 1])
                in ["NN", "NNP", "NNS", "NNPS"]
            ):
                out += " "
            else:
                out += ", "
        else:
            out += ". "

    # Append the sentence's final character (usually its closing punctuation);
    # stray leading/trailing separators are stripped below
    out += sentence[-1]
    return out.strip().strip(",").strip(".").strip()
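

# Example usage (illustrative; the exact output depends on the NLTK version
# and how the tagger labels the input):
#
#   nltk_speach_tag("The quick brown fox jumps over the lazy dog.")
#   # -> roughly "quick, brown fox, lazy dog": salient adjectives/nouns only,
#   #    adjective-noun pairs joined by a space, and any comma/period boundary
#   #    from the original sentence rendered as ". "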


def extract_keywords(text: str) -> str:
    """Return keywords extracted from text by a BERT model trained for keyword
    extraction, as a comma-separated string."""
    tokenizer = AutoTokenizer.from_pretrained("yanekyuk/bert-keyword-extractor")
    model = AutoModelForTokenClassification.from_pretrained(
        "yanekyuk/bert-keyword-extractor"
    )

    print(f"Extracting keywords from text: {text}")
    # Normalize whitespace so the sentence splitter only sees spaces
    for char in ["\n", "\t", "\r"]:
        text = text.replace(char, " ")
    sentences = text.split(".")

    result = ""
    for sentence in sentences:
        if not sentence.strip():
            continue
        print(f"Extracting keywords from sentence: {sentence}")
        inputs = tokenizer(
            sentence, return_tensors="pt", padding=True, truncation=True
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        # For each token, take the most likely class
        predicted_token_class_ids = logits.argmax(dim=-1)
        predicted_keywords = []
        for token_id, token in zip(
            predicted_token_class_ids[0],
            tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]),
        ):
            # Class id 1 marks keyword tokens for this model
            if token_id == 1:
                predicted_keywords.append(token)
        print(f"Extracted keywords: {predicted_keywords}")
        if predicted_keywords:
            result += ", ".join(predicted_keywords) + ", "

    result = result.rstrip(", ")
    print(f"All Keywords: {result}")
    return result
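

# The tokens extract_keywords returns are raw WordPiece pieces, so words the
# tokenizer splits come back fragmented (e.g. "trans", "##form", "##ers").
# The helper below is a minimal sketch for gluing continuation pieces back
# into whole words; it assumes the standard BERT "##" continuation prefix and
# is not part of the original pipeline.
def merge_wordpieces(pieces):
    words = []
    for piece in pieces:
        if piece.startswith("##") and words:
            # Continuation piece: append it to the previous word
            words[-1] += piece[2:]
        else:
            words.append(piece)
    return words


if __name__ == "__main__":
    # Quick smoke test of both extractors on a sample string (an assumption
    # for illustration, not from the original file). Running this downloads
    # the NLTK data and the Hugging Face model on first use.
    sample = (
        "The quick brown fox jumps over the lazy dog. "
        "Transformers provide pretrained models for keyword extraction."
    )
    print(nltk_speach_tag(sample))
    print(extract_keywords(sample))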