from datasets import load_dataset, concatenate_datasets, Dataset
from langchain.docstore.document import Document as LangchainDocument
from sentence_transformers import SentenceTransformer
#from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from huggingface_hub import Repository, upload_file
import pandas as pd
import os

DATA_PATH = './data'
HF_TOKEN = os.getenv('HF_Token')

#dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
##url = "https://www.webmd.com/"
#loader = WebBaseLoader(url)
#document = loader.load()


def create_vector_db():
    # Load all .txt files from the data directory
    loader = DirectoryLoader(DATA_PATH, glob='*.txt', loader_cls=TextLoader, show_progress=True)
    document = loader.load()

    # Split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=70)
    texts = text_splitter.split_documents(document)
    print(texts[1])
    print(texts[3])
    print(texts[17])

    # Each Document iterates as (field, value) pairs, so the resulting DataFrame has
    # column 0 = ('page_content', ...), column 1 = metadata, column 2 = type
    df = pd.DataFrame(texts)
    column_headers = list(df.columns.values)
    print(column_headers)
    pd.options.display.max_colwidth = 400

    # Keep only the page_content column
    df = df.drop(columns=[1, 2])
    print(df.iloc[[3]])
    df[0] = df[0].astype('string', errors='raise').copy()
    datatypes = df.dtypes
    print(datatypes)
    # Strip the leading "('page_content', '" (18 chars) and trailing "')"
    # left over from the tuple representation
    df[0] = df[0].str[18:]
    df[0] = df[0].str[:-2]
    print(df.iloc[[3]])

    # Embed every chunk with a sentence-transformers model
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    df['embeddings'] = df[0].apply(lambda x: embedding_model.encode(x))
    print(df.iloc[[17]])

    datasettextfile = Dataset.from_pandas(df)
    print("check2b")
    print(datasettextfile[3])

    # Pull in the pre-embedded PDF datasets and merge everything into one dataset
    datapdf1 = load_dataset("Namitg02/ADASOF24", split='train', streaming=False)
    datapdf2 = load_dataset("Namitg02/Krause1", split='train', streaming=False)
    datapdf3 = load_dataset("Namitg02/Krause2", split='train', streaming=False)
    # datapdf4 = load_dataset("Namitg02/Krause3", split='train', streaming=False)

    dataset_combine = concatenate_datasets([datasettextfile, datapdf1, datapdf2, datapdf3])
    dataset_combine.push_to_hub("Namitg02/Test", token=HF_TOKEN)


if __name__ == "__main__":
    print("check31")
    create_vector_db()
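

# Hedged usage sketch (not part of the original pipeline): once the combined
# dataset has been pushed, it can be loaded back and queried through a FAISS
# index over the 'embeddings' column produced above. This assumes faiss-cpu is
# installed, that the pushed dataset keeps the 'embeddings' column name, and
# the query string below is only a placeholder.
def query_vector_db_example(question="example question about the source documents"):
    # Load the combined dataset from the Hub and index the embedding vectors
    dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
    dataset.add_faiss_index(column='embeddings')

    # Embed the query with the same model used to build the dataset
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    question_embedding = embedding_model.encode(question)

    # Retrieve the closest chunks; index name defaults to the column name
    scores, examples = dataset.get_nearest_examples('embeddings', question_embedding, k=3)
    print(scores)
    print(examples)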