rohan13 committed on
Commit
0cad0b3
·
1 Parent(s): 028ac25

Custom csv loader

Browse files
Files changed (1) hide show
  1. custom_csv_loader.py +114 -0
custom_csv_loader.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from langchain.docstore.document import Document
5
+ from langchain.document_loaders.base import BaseLoader
6
+ from langchain.document_loaders.unstructured import (
7
+ UnstructuredFileLoader,
8
+ validate_unstructured_version,
9
+ )
10
+
11
+
12
class CSVLoader(BaseLoader):
    """Loads a CSV file into a list of documents.

    Each document represents one row of the CSV file. Every row is converted into
    ``column: value`` pairs, one pair per line of the document's page_content.

    The source for each document loaded from csv is set to the value of the
    `file_path` argument for all documents by default.
    You can override this by setting the `source_column` argument to the
    name of a column in the CSV file.
    The source of each document will then be set to the value of the column
    with the name specified in `source_column`.

    Fields beyond the header (which ``csv.DictReader`` collects under its
    ``restkey``) are left out of page_content, and values missing from a short
    row are rendered as an empty string.

    Output Example:
        .. code-block:: txt

            column1: value1
            column2: value2
            column3: value3
    """

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
        """Store the loader configuration; the file is not opened here.

        Args:
            file_path: The path to the CSV file.
            source_column: The name of the column in the CSV file to use as the source.
                Optional. Defaults to None.
            csv_args: A dictionary of arguments to pass to the csv.DictReader.
                Optional. Defaults to None.
            encoding: The encoding of the CSV file. Optional. Defaults to None.
        """
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        self.csv_args = csv_args or {}

    @staticmethod
    def _clean(value: Any) -> str:
        """Return ``value`` as stripped text; ``None`` becomes an empty string."""
        return "" if value is None else str(value).strip()

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            One Document per CSV row. ``metadata["source"]`` is the value of
            `source_column` for that row (or `file_path` when no source column
            is configured) and ``metadata["row"]`` is the zero-based row index.

        Raises:
            ValueError: If `source_column` is set but is not a key of a row.
        """
        docs = []
        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
            csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
            # DictReader stores overflow fields as a list under `restkey`
            # (None unless overridden through csv_args). Skip that key whatever
            # it is called; the literal "restkey" stays excluded as before.
            skipped_keys = {"restkey", csv_reader.restkey}
            for i, row in enumerate(csv_reader):
                # Short rows are padded with `restval` (None by default), so
                # values go through _clean instead of a bare .strip().
                content = "\n".join(
                    f"{self._clean(k)}: {self._clean(v)}"
                    for k, v in row.items()
                    if k not in skipped_keys
                )
                try:
                    source = (
                        row[self.source_column]
                        if self.source_column is not None
                        else self.file_path
                    )
                except KeyError as err:
                    raise ValueError(
                        f"Source column '{self.source_column}' not found in CSV file."
                    ) from err
                metadata = {"source": source, "row": i}
                doc = Document(page_content=content, metadata=metadata)
                docs.append(doc)

        return docs
78
+
79
+
80
class UnstructuredCSVLoader(UnstructuredFileLoader):
    """CSV loader backed by the ``unstructured`` package.

    As with the other Unstructured loaders, both "single" and "elements"
    mode are supported. In "elements" mode the CSV file will be a single
    Unstructured Table element, and an HTML representation of the table
    will be available in the "text_as_html" key in the document metadata.

    Examples
    --------
    from langchain.document_loaders.csv_loader import UnstructuredCSVLoader

    loader = UnstructuredCSVLoader("stanley-cups.csv", mode="elements")
    docs = loader.load()
    """

    def __init__(
        self,
        file_path: str,
        mode: str = "single",
        **unstructured_kwargs: Any,
    ):
        """Check the unstructured version, then initialise the base loader.

        Args:
            file_path: The path to the CSV file.
            mode: The mode to use when loading the CSV file.
                Optional. Defaults to "single".
            **unstructured_kwargs: Keyword arguments to pass to unstructured.
        """
        # The version check runs before the base class is initialised.
        # NOTE(review): presumably raises when unstructured is older than
        # 0.6.8 -- confirm against validate_unstructured_version.
        validate_unstructured_version(min_unstructured_version="0.6.8")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _get_elements(self) -> List:
        """Partition the CSV file with unstructured and return the result."""
        # Imported inside the method, so the import only happens when
        # elements are actually requested.
        from unstructured.partition.csv import partition_csv

        partition_options = self.unstructured_kwargs
        elements = partition_csv(filename=self.file_path, **partition_options)
        return elements