anonymizer / modules.py
ziggycross's picture
Added checkboxes for cleaning options.
3b7db7f
raw
history blame
2.39 kB
import pandas as pd
# File extensions, each written WITH a leading dot, that this module lists as supported.
# NOTE(review): the dispatch tables inside load_file and create_file are keyed on the
# extension WITHOUT the dot ("csv", "json", "xlsx"), so entries from this list cannot be
# passed to them unchanged -- confirm how callers use this constant.
SUPPORTED_TYPES = [".csv", ".json", ".xlsx"]
def hello_world():
    """
    Returns a fixed greeting string.
    @rtype: str
    @return: The literal text "hello world!".
    """
    greeting = "hello world!"
    return greeting
def load_file(file):
    """
    Takes a file given by Streamlit and loads into a DataFrame.
    Returns a DataFrame, metadata, and result string.

    The extension is the text after the last "." of the file name, compared
    case-insensitively, so "DATA.CSV" is read exactly like "data.csv".
    This function does not raise on unreadable input; failures are reported
    through the result string (prefixed with "Error:") and a None DataFrame.

    @param file: File uploaded into streamlit (must expose a `name` attribute
                 and be readable by the matching pandas reader), or None.
    @rtype: tuple
    @return: A tuple of format (pd.DataFrame, (str, str), str):
             the loaded DataFrame (None when nothing was loaded),
             the metadata (filename, lower-cased extension),
             and a human-readable result message ("" when file is None).
    """
    df = None
    if file is None:
        return df, ("", ""), ""

    filename = file.name
    # Normalise case so that upper/mixed-case extensions still hit the table below.
    extension = filename.split(".")[-1].lower()
    metadata = (filename, extension)

    import_functions = {
        "csv": pd.read_csv,
        "json": pd.read_json,
        "xlsx": pd.read_excel,
    }

    # The lookup itself cannot fail, so it stays outside the try block.
    reader = import_functions.get(extension)
    if reader is None:
        return df, metadata, f"Error: Invalid extension '{extension}'"

    try:
        df = reader(file)
        rows, columns = df.shape
    except Exception as error:
        # Deliberately broad: any parser failure is turned into a message for the
        # UI instead of crashing the app. Use the exception's class name rather
        # than type(error), which would render as "<class '...'>".
        return df, metadata, f"Error: Unable to read file '{filename}' ({type(error).__name__}: {error})"

    return df, metadata, f"File '{filename}' loaded successfully.\nFound {rows} rows, {columns} columns."
def _dataframe_to_xlsx_bytes(df):
    """Serialise a DataFrame to an in-memory .xlsx workbook and return its bytes."""
    from io import BytesIO

    # DataFrame.to_excel requires a target to write to and returns None, so
    # the workbook is written into a buffer and the buffer content is returned.
    buffer = BytesIO()
    df.to_excel(buffer)
    return buffer.getvalue()


def create_file(df, extension):
    """
    Prepares a dataframe from streamlit for download.

    @type df: pd.DataFrame
    @param df: A DataFrame to package into a file.
    @type extension: str
    @param extension: The desired filetype: "csv", "json" or "xlsx" (no leading dot).
    @return: The serialised data ready for download: a str for "csv" and
             "json", bytes for "xlsx", or None if the extension is unsupported.
    """
    export_functions = {
        "csv": pd.DataFrame.to_csv,
        "json": pd.DataFrame.to_json,
        # to_excel cannot be called without a destination, unlike to_csv/to_json.
        "xlsx": _dataframe_to_xlsx_bytes,
    }
    exporter = export_functions.get(extension)
    if exporter is None:
        return None
    return exporter(df)
def data_cleaner(df, drop_missing=False, remove_duplicates=True):
    """
    Cleans a DataFrame by dropping empty rows and, optionally, duplicate rows.
    The input DataFrame is not modified; a new one is returned.
    @type df: pd.DataFrame
    @param df: The uncleaned data.
    @type drop_missing: bool
    @param drop_missing: When True, every row containing at least one missing value is dropped;
                         when False, only rows whose values are all missing are dropped.
    @type remove_duplicates: bool
    @param remove_duplicates: When True, duplicate rows are removed after the missing-value step.
    @rtype: pd.DataFrame
    @return: The DataFrame with the requested cleaning applied.
    """
    # Step 1: missing values -- strict ("any") or lenient ("all") depending on the flag.
    if drop_missing:
        cleaned = df.dropna(how="any")
    else:
        cleaned = df.dropna(how="all")

    # Step 2: optional de-duplication of the remaining rows.
    if not remove_duplicates:
        return cleaned
    return cleaned.drop_duplicates()