medrag / tests /semantic_chunking.py
geekyrakshit's picture
update: app
170d9a9
raw
history blame contribute delete
449 Bytes
from medrag_multi_modal.semantic_chunking import SemanticChunker
def test_semantic_chunking():
    """Check the size and column schema of a semantically chunked dataset.

    Builds a ``SemanticChunker`` with ``chunk_size=256``, chunks the
    ``geekyrakshit/grays-anatomy-test`` document dataset, and verifies that
    the result has 49 rows and exactly the expected columns, in order.
    """
    # Column order matters: the comparison below is a list equality check.
    expected_columns = [
        "document_idx",
        "text",
        "page_idx",
        "document_name",
        "file_path",
        "file_url",
        "loader_name",
    ]

    semantic_chunker = SemanticChunker(chunk_size=256)
    chunked = semantic_chunker.chunk(
        document_dataset="geekyrakshit/grays-anatomy-test"
    )

    assert chunked.num_rows == 49
    assert chunked.column_names == expected_columns