# Description

This script processes image datasets stored in `.zip` files for specific colors (`Green`, `Orange`, `Blue`). It extracts the images, randomly shuffles them, and splits them into 70% training and 30% testing sets. All images are rotated according to their EXIF orientation, resized to 224x224 pixels, and saved into separate folders (`Train` and `Test`) within the same Google Drive directory. At the same time, the GPS coordinates (latitude and longitude) of the images in the training and testing sets are extracted and stored together with the image names in that same directory, in the files `train_coordinates.csv` and `test_coordinates.csv`. This process is repeated for each color dataset, ensuring the images are organized and prepared for machine learning tasks.

# Imports

In [None]:
!pip install exifread



In [None]:
import os
import exifread, csv
import zipfile
import random
from PIL import Image
from pathlib import Path
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Helper Functions

In [None]:
# for extracting GPS coordinates, as specified in the helper notebook
def get_exif_data(image_path):
    """Read the EXIF tags of an image file.

    Opens ``image_path`` in binary mode and returns whatever
    ``exifread.process_file`` yields for that file.
    """
    with open(image_path, 'rb') as handle:
        return exifread.process_file(handle)

def export_exif_to_json(exif_data, output_file):
    """Write EXIF tags to ``output_file`` as indented JSON.

    Args:
        exif_data: Mapping of tag names to tag values. Keys and values are
            both converted with ``str`` so that any tag object can be dumped.
        output_file: Path of the JSON file to create (overwritten if present).
    """
    # Imported here because the notebook's import cell never brings in
    # ``json``; without this the function raised NameError on first use.
    import json

    serializable = {str(tag): str(value) for tag, value in exif_data.items()}
    with open(output_file, 'w') as json_file:
        json.dump(serializable, json_file, indent=4)

# Function to convert GPS coordinates in degrees, minutes, and seconds to decimal degrees
def convert_to_decimal_degrees(value):
    """Turn a degrees/minutes/seconds triple into decimal degrees.

    ``value.values`` must unpack into exactly three items, each exposing a
    ``num`` (numerator) and ``den`` (denominator) attribute.
    """
    degrees, minutes, seconds = value.values
    whole = degrees.num / degrees.den
    minute_part = (minutes.num / minutes.den) / 60
    second_part = (seconds.num / seconds.den) / 3600
    return whole + minute_part + second_part

# extract files from zip
def extract_zip(zip_file, extract_to):
    """Unpack every member of the archive ``zip_file`` into ``extract_to``."""
    archive = zipfile.ZipFile(zip_file, 'r')
    with archive:
        archive.extractall(path=extract_to)

# resize images to 224 by 224
def resize_image(input_path, output_path, size=(224, 224)):
    """Save an upright, resized copy of an image.

    The EXIF orientation tag (274) is read when available; orientations 3, 6
    and 8 are undone by rotating the image before it is resized. Reading the
    tag is best-effort: if it fails the image is resized without rotation.

    Args:
        input_path: Path of the source image.
        output_path: Path the resized image is written to.
        size: Target ``(width, height)`` passed to ``Image.resize``.
    """
    # EXIF orientation value -> angle handed to ``Image.rotate``.
    rotation_for_orientation = {3: 180, 6: 270, 8: 90}

    with Image.open(input_path) as img:
        try:
            orientation = img.getexif().get(274)
        except (AttributeError, KeyError, IndexError):
            orientation = None

        angle = rotation_for_orientation.get(orientation)
        upright = img if angle is None else img.rotate(angle, expand=True)

        # Resize and save while the source file is still open: an image that
        # was not rotated has not had its pixel data loaded yet, so using it
        # after the ``with`` block closes the file is unsafe.
        upright.resize(size).save(output_path)

# process color data
def _gps_decimal_coordinates(exif_data):
    """Return ``(latitude, longitude)`` in signed decimal degrees, or ``None``.

    ``None`` is returned when ``exif_data`` is empty or lacks either the
    ``GPS GPSLatitude`` or the ``GPS GPSLongitude`` tag.
    """
    if not exif_data:
        return None

    gps_latitude = exif_data.get("GPS GPSLatitude")
    gps_longitude = exif_data.get("GPS GPSLongitude")
    if not (gps_latitude and gps_longitude):
        return None

    latitude = convert_to_decimal_degrees(gps_latitude)
    longitude = convert_to_decimal_degrees(gps_longitude)

    # Southern latitudes and western longitudes are negative.
    gps_latitude_ref = exif_data.get("GPS GPSLatitudeRef")
    gps_longitude_ref = exif_data.get("GPS GPSLongitudeRef")
    if gps_latitude_ref and gps_latitude_ref.values[0] == "S":
        latitude = -latitude
    if gps_longitude_ref and gps_longitude_ref.values[0] == "W":
        longitude = -longitude
    return latitude, longitude


def process_color(folder_path, output_folder, color, train_csv, test_csv, seed=None):
    """Split one color's photos 70/30, resize them and log their GPS data.

    Every ``*.JPG`` file below ``folder_path`` is shuffled and split into a
    training part (first 70%) and a testing part (rest). Each image is
    written, resized, to ``<output_folder>/Train`` or ``<output_folder>/Test``
    under the name ``<stem>_<color letter><suffix>``. For images whose EXIF
    data holds GPS latitude and longitude, a row is appended to the matching
    CSV file.

    Args:
        folder_path: Folder searched recursively for ``*.JPG`` files.
        output_folder: Parent folder of the ``Train`` and ``Test`` folders.
        color: One of ``"Green"``, ``"Orange"``, ``"Blue"``.
        train_csv: CSV path that receives rows for training images.
        test_csv: CSV path that receives rows for testing images.
        seed: Optional seed for a reproducible split. ``None`` (default)
            shuffles with the module-level random generator as before.

    Raises:
        KeyError: If ``color`` is not one of the three known colors.
    """
    color_abbreviation = {"Green": "g", "Orange": "o", "Blue": "b"}[color]

    # Sort first so that a given seed always yields the same split, no matter
    # in which order the file system lists the files.
    # NOTE(review): the pattern only matches an upper-case ".JPG" suffix on
    # case-sensitive file systems -- confirm no ".jpg"/".jpeg" files exist.
    image_files = sorted(Path(folder_path).rglob("*.JPG"))
    if seed is None:
        random.shuffle(image_files)
    else:
        random.Random(seed).shuffle(image_files)

    # train and test split
    split_index = int(0.7 * len(image_files))
    train_files = image_files[:split_index]
    test_files = image_files[split_index:]

    # train and test output directories
    train_output_dir = Path(output_folder) / "Train"
    test_output_dir = Path(output_folder) / "Test"

    fieldnames = ["file_name", "Latitude", "Longitude"]
    for files, output_dir, csv_path in (
        (train_files, train_output_dir, train_csv),
        (test_files, test_output_dir, test_csv),
    ):
        # Saving an image fails if its target folder is missing.
        output_dir.mkdir(parents=True, exist_ok=True)

        with open(csv_path, mode='a', newline='') as csv_handle:
            writer = csv.DictWriter(csv_handle, fieldnames=fieldnames)
            # The files are opened in append mode for every color, so the
            # header is only written while the file is still empty.
            if os.stat(csv_path).st_size == 0:
                writer.writeheader()

            for file in files:
                renamed_file = f"{file.stem}_{color_abbreviation}{file.suffix}"

                coordinates = _gps_decimal_coordinates(get_exif_data(file))
                if coordinates is not None:
                    latitude, longitude = coordinates
                    writer.writerow({"file_name": renamed_file, "Latitude": latitude, "Longitude": longitude})

                # The image is saved even when it has no GPS data.
                resize_image(file, output_dir / renamed_file)

    print(f"Processed {color}: {len(train_files)} train, {len(test_files)} test images.")

In [None]:
# Unpack the Blue archive into its own "Blue_extracted" folder next to the zip.
blue = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos/Blue.zip"
blue_path_first = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos/Blue_extracted"
extract_zip(zip_file=blue, extract_to=blue_path_first)

# Creating the Dataset and Extracting GPS coordinates

Processed Green: 795 train, 341 test images.

Processed Orange: 2128 train, 912 test images.

Processed Blue: 1546 train, 663 test images.

In [None]:
# Drive folder that receives the Train/ and Test/ image folders and both CSVs
google_drive_folder = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos"

# Extracted photo folders, one per color
green_path = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos/Green_extracted/Green"
orange_path = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos/Orange_extracted/Orange"
blue_path = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos/Blue_extracted/Blue"

# Coordinate CSVs: process_color appends to them, so start from empty files
train_csv = os.path.join(google_drive_folder, "train_coordinates.csv")
test_csv = os.path.join(google_drive_folder, "test_coordinates.csv")
with open(train_csv, "w"), open(test_csv, "w"):
    pass

# Run the split/resize/GPS pipeline once per color
colors = ["Blue", "Green", "Orange"]
source_folders = {"Green": green_path, "Orange": orange_path, "Blue": blue_path}
for color in colors:
    file_path = source_folders[color]
    process_color(file_path, google_drive_folder, color, train_csv, test_csv)

Processed Blue: 1546 train, 663 test images.


# Save Datasets from Drive to Hugging Face 2

In [None]:
!pip install huggingface_hub

In [None]:
from datasets import Dataset
from PIL import Image
import pandas as pd
import os

# Define a function to load and embed images as PIL objects
def load_image_as_pil(row, base_path):
    """Load the image named in ``row`` as an RGB PIL image.

    Args:
        row: Mapping with a ``"file_name"`` entry.
        base_path: Folder that ``row["file_name"]`` is relative to.

    Returns:
        A dict ``{"image": <PIL image in RGB mode>}``.
    """
    image_path = os.path.join(base_path, row["file_name"])  # Combine base path and file name
    # Close the source file right after conversion; ``convert`` returns an
    # independent in-memory image, and leaving thousands of files open while
    # mapping over a dataset leaks file handles.
    with Image.open(image_path) as source:
        image = source.convert("RGB")  # Ensure image is in RGB format
    return {"image": image}

# Paths to train and test CSVs and image directories
# Locations of the coordinate CSVs and of the image folders they refer to.
# NOTE(review): these paths (train/train.csv, train/images/, ...) differ from
# the Train/ and Test/ folders and *_coordinates.csv files written earlier in
# this notebook -- confirm the files were moved/renamed before running.
train_csv_path = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos/train/train.csv"
test_csv_path = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos/test/test.csv"
train_image_base_path = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos/train/images/"
test_image_base_path = "/content/drive/MyDrive/CIS 5190 Project Folder/Photos/test/images/"

# Train split: read the CSV, attach each image, then drop the file_name
# column (optional) once the image itself is part of the record.
train_df = pd.read_csv(train_csv_path)
train_dataset = (
    Dataset.from_pandas(train_df)
    .map(lambda row: load_image_as_pil(row, train_image_base_path))
    .remove_columns("file_name")
)
train_dataset.save_to_disk("train_dataset_with_images")

# Test split: same steps with the test CSV and image folder.
test_df = pd.read_csv(test_csv_path)
test_dataset = (
    Dataset.from_pandas(test_df)
    .map(lambda row: load_image_as_pil(row, test_image_base_path))
    .remove_columns("file_name")
)
test_dataset.save_to_disk("test_dataset_with_images")

Map:   0%|          | 0/4469 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4469 [00:00<?, ? examples/s]

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1916 [00:00<?, ? examples/s]

In [None]:
# Upload both splits to the Hugging Face Hub under the CIS-5190-CIA namespace.
# private=False is passed explicitly for each repository. The test push stays
# last so that its return value is what the notebook cell displays.
train_dataset.push_to_hub("CIS-5190-CIA/Training_images", private=False)
test_dataset.push_to_hub("CIS-5190-CIA/Testing_images", private=False)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/4469 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CIS-5190-CIA/Testing_images/commit/0d87985b40b5d7076ade14d1d8122b1874351e92', commit_message='Upload dataset', commit_description='', oid='0d87985b40b5d7076ade14d1d8122b1874351e92', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CIS-5190-CIA/Testing_images', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CIS-5190-CIA/Testing_images'), pr_revision=None, pr_num=None)