Spaces:
Running
on
Zero
Running
on
Zero
import tiktoken | |
def normalize_text(text: str) -> str:
    """Prepare raw text for TTS processing.

    Currently this only removes leading and trailing whitespace; any falsy
    input (empty string or ``None``) yields an empty string.
    """
    # Deliberately minimal - further normalization rules can be added here.
    return text.strip() if text else ""
def _split_after(text: str, delimiters: str) -> list[str]:
    """Split *text* after each character found in *delimiters*, keeping it."""
    pieces = []
    start = 0
    for index, char in enumerate(text):
        if char in delimiters:
            pieces.append(text[start:index + 1])
            start = index + 1
    if start < len(text):
        pieces.append(text[start:])
    return pieces


def _split_into_pieces(text: str, max_chars: int) -> list[str]:
    """Break *text* into sentences, falling back to clauses and then words."""
    pieces = []
    for sentence in _split_after(text, ".!?;"):
        if len(sentence) <= max_chars:
            pieces.append(sentence)
            continue
        # Sentence is too long: fall back to comma-delimited clauses. The
        # comma stays attached to its clause, so none is invented or lost.
        for clause in _split_after(sentence, ","):
            if len(clause) <= max_chars:
                pieces.append(clause)
                continue
            # Clause is still too long: fall back to individual words. Only
            # put a space before the first word if the clause had one, so
            # text such as "3.14" is not pulled apart.
            separator = " " if clause[0].isspace() else ""
            for word in clause.split():
                pieces.append(separator + word)
                separator = " "
    return pieces


def chunk_text(text: str, max_chars: int = 400) -> list[str]:
    """Break text into chunks at natural boundaries.

    The text is split after sentence terminators (``.``, ``!``, ``?``,
    ``;``). A sentence longer than ``max_chars`` is split after commas, and
    a clause that is still too long is split on whitespace. The resulting
    pieces are packed greedily into chunks.

    Args:
        text: Text to split. Empty or ``None`` input yields an empty list.
        max_chars: Maximum length of a chunk after stripping whitespace.

    Returns:
        Non-empty, whitespace-stripped chunks in their original order. Each
        is at most ``max_chars`` long, except that a single word longer than
        ``max_chars`` is kept whole as its own chunk rather than being cut
        mid-word.
    """
    chunks: list[str] = []
    if not text:
        return chunks

    buffer = ""
    for piece in _split_into_pieces(text, max_chars):
        candidate = buffer + piece
        # Measure the stripped length because that is what gets emitted.
        # Never flush a blank buffer, so no empty chunk is produced.
        if buffer.strip() and len(candidate.strip()) > max_chars:
            chunks.append(buffer.strip())
            buffer = piece
        else:
            buffer = candidate

    if buffer.strip():
        chunks.append(buffer.strip())
    return chunks
def count_tokens(text: str) -> int:
    """Count tokens in text using tiktoken's ``cl100k_base`` encoding.

    Args:
        text: Text to measure. Empty or ``None`` input counts as zero tokens.

    Returns:
        The number of tokens the encoder produces for ``text``.
    """
    if not text:
        # Nothing to encode, so skip loading the encoder entirely.
        return 0
    enc = tiktoken.get_encoding("cl100k_base")
    # By default ``encode`` raises ValueError when the text contains a
    # special-token string such as "<|endoftext|>". When counting tokens in
    # arbitrary user text, encode such strings as ordinary text instead.
    return len(enc.encode(text, disallowed_special=()))