"""Helpers for loading, exporting and cleaning tabular data in a Streamlit app."""
import io

import pandas as pd

SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]


def hello_world():
    """
    Returns a fixed greeting string.

    @rtype: str
    @return: The string "hello world!".
    """
    return "hello world!"


def load_file(file):
    """
    Takes a file given by Streamlit and loads into a DataFrame.
    Returns a DataFrame, metadata, and result string.

    The extension is taken from the text after the last "." in the file name
    and is matched case-insensitively, so "data.CSV" is read like "data.csv".
    Failures are reported through the result string rather than raised.

    @param file: File uploaded into streamlit. Must expose a `name` attribute
        and be readable by the pandas reader chosen for its extension.
        May be None, meaning nothing was uploaded.
    @rtype: tuple
    @return: A tuple of format (pd.DataFrame, (str, str), str). The DataFrame
        is None when no file was given, the extension is unsupported, or the
        reader failed. The metadata pair is (filename, extension) with the
        extension exactly as written in the file name.
    """
    df = None
    if file is None:
        return df, ("", ""), ""

    filename = file.name
    extension = filename.split(".")[-1]
    metadata = (filename, extension)

    import_functions = {
        "csv": pd.read_csv,
        "json": pd.read_json,
        "xlsx": pd.read_excel,
    }
    # Look up with the lower-cased extension so "CSV"/"Xlsx" are accepted;
    # the metadata and messages keep the extension as the user wrote it.
    reader = import_functions.get(extension.lower())
    if reader is None:
        return df, metadata, f"Error: Invalid extension '{extension}'"

    try:
        df = reader(file)
        rows, columns = df.shape
    except Exception as error:
        # Deliberately broad: this is the UI boundary, and any parser failure
        # should become a message for the user instead of crashing the app.
        return df, metadata, f"Error: Unable to read file '{filename}' ({type(error)}: {error})"

    return df, metadata, f"File '{filename}' loaded successfully.\nFound {rows} rows, {columns} columns."


def _to_excel_bytes(df):
    """Serializes a DataFrame to an in-memory .xlsx workbook and returns its bytes."""
    buffer = io.BytesIO()
    # DataFrame.to_excel requires a target to write to; unlike to_csv/to_json
    # it cannot return the serialized content directly.
    df.to_excel(buffer)
    return buffer.getvalue()


def create_file(df, extension):
    """
    Prepares a dataframe from streamlit for download.

    The extension is matched case-insensitively and may carry a leading dot,
    so both "csv" and ".csv" (the form used in SUPPORTED_TYPES) are accepted.

    @type df: pd.DataFrame
    @param df: A DataFrame to package into a file.
    @type extension: str
    @param extension: The desired filetype ("csv", "json" or "xlsx").
    @return: The serialized content ready for download: a str for "csv" and
        "json", bytes for "xlsx". None if the extension is not supported.
    @raise ImportError: For "xlsx" when no Excel writer engine is installed
        (raised by pandas).
    """
    if not isinstance(extension, str):
        return None

    export_functions = {
        "csv": pd.DataFrame.to_csv,
        "json": pd.DataFrame.to_json,
        "xlsx": _to_excel_bytes,
    }
    exporter = export_functions.get(extension.lower().lstrip("."))
    if exporter is None:
        return None
    return exporter(df)


def data_cleaner(df, drop_missing=False, remove_duplicates=True):
    """
    Takes a DataFrame and removes empty and duplicate entries.

    The input DataFrame is not modified; a new DataFrame is returned.

    @type df: pd.DataFrame
    @param df: A DataFrame of uncleaned data.
    @type drop_missing: bool
    @param drop_missing: If True, rows with any missing value are dropped
        ("any"); if False, only rows that are entirely empty are dropped ("all").
    @type remove_duplicates: bool
    @param remove_duplicates: Determines if duplicate rows are removed.
    @rtype: pd.DataFrame
    @return: A DataFrame with requested cleaning applied
    """
    df = df.dropna(how="any" if drop_missing else "all")
    if remove_duplicates:
        df = df.drop_duplicates()
    return df