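"""Preprocess raw CGM (continuous glucose monitoring) CSV exports into a tidy (id, time, gl) table."""
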
import pandas as pd
from pathlib import Path
import typer

def process_csv(
        input_file: Path,
        output_file: Path,
        event_type_filter: str = 'egv',
        drop_duplicates: bool = True,
        time_diff_minutes: int = 1,
        chunk_size: int = 1000,
) -> pd.DataFrame:
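    """Convert a single raw CGM export CSV (Dexcom-style columns) into an (id, time, gl) table.

    Keeps rows matching ``event_type_filter`` that have no Event Subtype, renames the
    timestamp and glucose columns, assigns chunked ids, enforces a minimum spacing of
    ``time_diff_minutes`` between consecutive readings, optionally drops duplicate
    timestamps, and writes the result to ``output_file`` before returning it.
    """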

    # Read the input CSV file into a DataFrame
    df = pd.read_csv(input_file, low_memory=False)

    # Filter by Event Type (case-insensitive) and keep only rows with no Event Subtype
    df = df[df['Event Type'].str.lower() == event_type_filter.lower()]
    df = df[df['Event Subtype'].isna()]

    # List of columns to keep
    columns_to_keep = [
        'Index',
        'Timestamp (YYYY-MM-DDThh:mm:ss)',
        'Glucose Value (mg/dL)',
    ]

    # Keep only the specified columns
    df = df[columns_to_keep]

    # Rename columns
    column_rename = {
        'Index': 'id',
        'Timestamp (YYYY-MM-DDThh:mm:ss)': 'time',
        'Glucose Value (mg/dL)': 'gl'
    }
    df = df.rename(columns=column_rename)

    # Drop rows with a missing index before casting, otherwise astype(int) fails on NaN
    df = df.dropna(subset=['id'])
    df['id'] = df['id'].astype(int)

    # Handle id assignment based on chunk_size: each run of chunk_size rows shares one id
    if chunk_size is None or chunk_size == 0:
        df['id'] = 1  # Assign the same id to all rows
    else:
        df = df.reset_index(drop=True)  # Chunk by row position, not the original CSV index
        df['id'] = (df.index // chunk_size).astype(int)

    # Convert timestamp to datetime and sort chronologically so the diff below is meaningful
    df['time'] = pd.to_datetime(df['time'])
    df = df.sort_values('time')

    # Keep only rows at least time_diff_minutes apart from the previous reading
    df['time_diff'] = df['time'].diff()
    df = df[df['time_diff'].isna() | (df['time_diff'] >= pd.Timedelta(minutes=time_diff_minutes))]

    # Drop the temporary time_diff column
    df = df.drop(columns=['time_diff'])

    # Ensure glucose values are in float64
    df['gl'] = df['gl'].astype('float64')

    # Optionally drop duplicate rows based on time
    if drop_duplicates:
        df = df.drop_duplicates(subset=['time'], keep='first')

    # Write the modified dataframe to a new CSV file
    df.to_csv(output_file, index=False)

    #typer.echo("CSV file has been successfully processed.")

    return df


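# Multi-file variant of the same pipeline, kept for reference; currently disabled
# by wrapping it in a string literal: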
'''
def process_multiple_csv(
        input_dir: Path = typer.Argument('./raw_data/livia_unmerged', help="Directory containing the input CSV files."),
        output_file: Path = typer.Argument('./raw_data/livia_unmerged/livia_mini.csv', help="Path to save the processed CSV file."),
        event_type_filter: str = typer.Option('egv', help="Event type to filter by."),
        drop_duplicates: bool = typer.Option(True, help="Whether to drop duplicate timestamps."),
        time_diff_minutes: int = typer.Option(1, help="Minimum time difference in minutes to keep a row."),
        chunk_size: int = typer.Option(1000, help="Chunk size for the 'id' column increment. Set to 0 or None for a single id."),
):
    # Get all the CSV files in the specified directory
    all_files = list(input_dir.glob("*.csv"))

    # List to store the DataFrames
    df_list = []

    # Read each CSV file into a DataFrame and append to the list
    for filename in all_files:
        df = pd.read_csv(filename, low_memory=False)
        df_list.append(df)

    # Concatenate all DataFrames in the list
    combined_df = pd.concat(df_list, ignore_index=True)

    # Filter by Event Type and Event Subtype
    combined_df = combined_df[combined_df['Event Type'].str.lower() == event_type_filter]
    combined_df = combined_df[combined_df['Event Subtype'].isna()]

    # List of columns to keep
    columns_to_keep = [
        'Index',
        'Timestamp (YYYY-MM-DDThh:mm:ss)',
        'Glucose Value (mg/dL)',
    ]

    # Keep only the specified columns
    combined_df = combined_df[columns_to_keep]

    # Rename columns
    column_rename = {
        'Index': 'id',
        'Timestamp (YYYY-MM-DDThh:mm:ss)': 'time',
        'Glucose Value (mg/dL)': 'gl'
    }
    combined_df = combined_df.rename(columns=column_rename)

    # Convert timestamp to datetime, then sort chronologically
    combined_df['time'] = pd.to_datetime(combined_df['time'])
    combined_df = combined_df.sort_values('time').reset_index(drop=True)

    # Handle id assignment based on chunk_size: each run of chunk_size rows shares one id
    if chunk_size is None or chunk_size == 0:
        combined_df['id'] = 1  # Assign the same id to all rows
    else:
        combined_df['id'] = (combined_df.index // chunk_size).astype(int)

    # Calculate time difference and keep rows with at least the specified time difference
    combined_df['time_diff'] = combined_df['time'].diff()
    combined_df = combined_df[combined_df['time_diff'].isna() | (combined_df['time_diff'] >= pd.Timedelta(minutes=time_diff_minutes))]

    # Drop the temporary time_diff column
    combined_df = combined_df.drop(columns=['time_diff'])

    # Ensure glucose values are in float64
    combined_df['gl'] = combined_df['gl'].astype('float64')

    # Optionally drop duplicate rows based on time
    if drop_duplicates:
        combined_df = combined_df.drop_duplicates(subset=['time'], keep='first')

    # Write the modified dataframe to a new CSV file
    combined_df.to_csv(output_file, index=False)

    typer.echo("CSV files have been successfully merged, modified, and saved.")
'''
if __name__ == "__main__":
    typer.run(process_csv)
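
# Example invocation (illustrative script name and paths, not taken from this repo):
#   python process_csv.py ./raw_data/export.csv ./processed/glucose.csv \
#       --event-type-filter egv --time-diff-minutes 1 --chunk-size 1000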