{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "460d90da-b986-4c1c-8a66-eab144b0ba8d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Started Fetching Data\n",
      "Failed to fetch data, retrying. Attempt 1/10\n",
      "Failed to fetch data, retrying. Attempt 1/10\n",
      "Fetched data for all the Pages.\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "import time\n",
    "\n",
    "import requests\n",
    "\n",
    "# Fixed seed so a fresh kernel samples the same offsets on every run.\n",
    "SEED = 42\n",
    "random.seed(SEED)\n",
    "\n",
    "# 500 random row offsets into the train split; each one starts a page of rows.\n",
    "pages = [random.randint(1, 968000015) for _ in range(500)]\n",
    "\n",
    "base_url = \"https://datasets-server.huggingface.co/rows\"\n",
    "params = {\n",
    "    \"dataset\": \"tiiuae/falcon-refinedweb\",\n",
    "    \"config\": \"default\",\n",
    "    \"split\": \"train\",\n",
    "}\n",
    "num_rows_per_page = 100\n",
    "retry_limit = 10\n",
    "retry_delay = 5  # seconds to wait between attempts\n",
    "request_timeout = 30  # seconds before a stalled request is abandoned\n",
    "Falcon = []  # accumulated \"content\" strings from every fetched row\n",
    "\n",
    "\n",
    "def fetch_data_for_page(page):\n",
    "    \"\"\"Fetch one page of rows and append each row's content to ``Falcon``.\n",
    "\n",
    "    Args:\n",
    "        page: Row offset at which the requested page starts.\n",
    "\n",
    "    Raises:\n",
    "        requests.exceptions.RequestException: If the request still fails\n",
    "            after ``retry_limit`` attempts.\n",
    "    \"\"\"\n",
    "    # Build the query per call instead of mutating the shared ``params`` dict.\n",
    "    query = {**params, \"offset\": page, \"limit\": num_rows_per_page}\n",
    "    for attempt in range(1, retry_limit + 1):\n",
    "        try:\n",
    "            response = requests.get(base_url, params=query, timeout=request_timeout)\n",
    "            response.raise_for_status()  # raises HTTPError on an unsuccessful status code\n",
    "            # Extract everything first so a malformed row cannot leave a\n",
    "            # half-appended page behind.\n",
    "            contents = [row[\"row\"][\"content\"] for row in response.json()[\"rows\"]]\n",
    "        except requests.exceptions.RequestException:\n",
    "            # Covers HTTP errors as well as connection failures and timeouts.\n",
    "            print(f\"Failed to fetch data, retrying. Attempt {attempt}/{retry_limit}\")\n",
    "            if attempt == retry_limit:\n",
    "                print(\"Maximum retry limit reached. Unable to fetch data.\")\n",
    "                raise\n",
    "            time.sleep(retry_delay)  # wait before the next retry\n",
    "        else:\n",
    "            Falcon.extend(contents)\n",
    "            return\n",
    "\n",
    "\n",
    "print('Started Fetching Data')\n",
    "for page in pages:\n",
    "    fetch_data_for_page(page)\n",
    "\n",
    "print(\"Fetched data for all the Pages.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f8f3baf1-5480-450b-a456-174a5c114d3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Scraped web text contains non-ASCII characters, so pin UTF-8 explicitly\n",
    "# rather than relying on the platform's default encoding.\n",
    "with open(\"FalconData2.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as csvfile:\n",
    "    writer = csv.writer(csvfile)\n",
    "\n",
    "    # Header row, then one single-column row per fetched document\n",
    "    writer.writerow([\"Text\"])\n",
    "    writer.writerows([element] for element in Falcon)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "ea47c936-2c2b-4414-ba57-74fb6827ec0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of duplicate rows: 5\n",
      " Text\n",
      "522 Name:\n",
      "11746 Description.\\nReviews\\nThere are no reviews yet.\n",
      "17606 Description.\\nReviews\\nThere are no reviews yet.\n",
      "30436 NaN\n",
      "42549 !\\n\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Read the CSV file into a pandas DataFrame\n",
    "df = pd.read_csv(\"FalconData2.csv\")\n",
    "\n",
    "# Check for duplicate rows\n",
    "duplicate_rows = df[df.duplicated()]\n",
    "\n",
    "# Print the number of duplicate rows\n",
    "print(f\"Number of duplicate rows: {len(duplicate_rows)}\")\n",
    "\n",
    "# Print the duplicate rows\n",
    "print(duplicate_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "f4178cd6-747f-4e05-a9bf-17b97f959e06",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Text
0[…]\\nM&S bank […]\\nLowest unsecured loan rate...
1JavaScript seems to be disabled in your browse...
2CMTech has designed a game to foster social in...
3A Storyteller's Point of View\\nMy\\nWriting\\nLe...
4mspu.us was registered 1 decade 3 years ago. I...
\n", "
" ], "text/plain": [ " Text\n", "0 […]\\nM&S bank […]\\nLowest unsecured loan rate...\n", "1 JavaScript seems to be disabled in your browse...\n", "2 CMTech has designed a game to foster social in...\n", "3 A Storyteller's Point of View\\nMy\\nWriting\\nLe...\n", "4 mspu.us was registered 1 decade 3 years ago. I..." ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 32, "id": "264548c1-4cf4-441f-a433-2f5d57861dc4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Text
49995Alumni in Action: Grace Heyne Lybrand\\nWhen Gr...
49996This.\\n51.351813 -105.220438\\n12 replies on “L...
49997VIDEO 1: Panel discussion with John Nichols, a...
49998The Prototype DA-2A made its first flight on M...
49999default search action\\nBibTeX record journals/...
\n", "
" ], "text/plain": [ " Text\n", "49995 Alumni in Action: Grace Heyne Lybrand\\nWhen Gr...\n", "49996 This.\\n51.351813 -105.220438\\n12 replies on “L...\n", "49997 VIDEO 1: Panel discussion with John Nichols, a...\n", "49998 The Prototype DA-2A made its first flight on M...\n", "49999 default search action\\nBibTeX record journals/..." ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.tail()" ] },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "3f215b09-8050-4477-860c-d3ed0a19f45d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Words:\n",
      "0 65\n",
      "1 79\n",
      "2 287\n",
      "3 302\n",
      "4 130\n",
      " ... \n",
      "49995 64\n",
      "49996 325\n",
      "49997 58\n",
      "49998 623\n",
      "49999 67\n",
      "Name: Text, Length: 50000, dtype: int64\n",
      "Smallest Row:\n",
      "Text This\n",
      "Name: 270, dtype: object\n",
      "\n",
      "Largest Row:\n",
      "Text MAMMALS\\n400. Abu Jafar, M.Z., and C. Hays-Sha...\n",
      "Name: 33020, dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Whitespace-delimited word count per row, kept as a standalone Series\n",
    "# (no column is added to df). str() lets missing cells through as \"nan\".\n",
    "word_counts = df[\"Text\"].apply(lambda text: len(str(text).split()))\n",
    "\n",
    "print(\"Number of Words:\")\n",
    "print(word_counts)\n",
    "\n",
    "# Rows holding the fewest and the most words\n",
    "smallest_row = df.loc[word_counts.idxmin()]\n",
    "largest_row = df.loc[word_counts.idxmax()]\n",
    "\n",
    "print(\"Smallest Row:\")\n",
    "print(smallest_row)\n",
    "\n",
    "print(\"\\nLargest Row:\")\n",
    "print(largest_row)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "be5a87a8-cfee-4f63-992e-8fa1d4a5cdbb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text NaN\n",
       "Name: 30436, dtype: object"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_row=30436\n",
    "specific_row = df.iloc[target_row]\n",
    "specific_row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e97d9e18-eaa0-4a1b-96ab-c89a0f4c738d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Text The old wireline Bell telephone system was bui...\n",
      "Name: 19995, dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(specific_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "940ef35f-7517-403d-9f42-73760182dcaa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Text The old wireline Bell telephone system was bui...\n"
     ]
    }
   ],
   "source": [
    "print(specific_row.to_string())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "915ac669-718f-47f5-b175-a5f928b407db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Length of the document itself. to_string() yields the truncated display\n",
    "# form, so its length says nothing about the text; str() keeps this\n",
    "# working when the cell is missing (NaN).\n",
    "print(len(str(specific_row[\"Text\"])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "ab5ee254-9ba7-496b-97c7-3b6185c21971",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training set size: 49000\n",
      "Validation set size: 1000\n"
     ]
    }
   ],
   "source": [
    "# import pandas as pd\n",
    "\n",
    "# # Load the dataset\n",
    "# df = pd.read_csv(\"FalconData2.csv\")\n",
    "\n",
    "# # Calculate the index to split the data at the last 2%\n",
    "# split_index = int(len(df) * 0.980)\n",
    "\n",
    "# # Split the data into training and validation sets\n",
    "# train_df = df.iloc[:split_index] # First 98% for training\n",
    "# validation_df = df.iloc[split_index:] # Last 2% for validation\n",
    "\n",
    "# # Display the sizes of the training and validation sets\n",
    "# print(f\"Training set size: {len(train_df)}\")\n",
    "# print(f\"Validation set size: {len(validation_df)}\")\n",
    "\n",
    "# # Optionally, save the datasets to new CSV files\n",
    "# train_df.to_csv(\"FalconData_train2.csv\", index=False)\n",
    "# 
validation_df.to_csv(\"FalconData_validation2.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "7a16fb10-40cd-4668-b363-57ca64819ad3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of rows removed due to NaN values: 2\n",
      "Training set size: 48998\n",
      "Validation set size: 1000\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Reload the scraped corpus from disk\n",
    "df = pd.read_csv(\"FalconData2.csv\")\n",
    "\n",
    "# Discard rows whose text is missing and report how many were lost\n",
    "original_length = len(df)\n",
    "df = df.dropna()\n",
    "removed_rows = original_length - len(df)\n",
    "print(f\"Number of rows removed due to NaN values: {removed_rows}\")\n",
    "\n",
    "# Position separating the leading 98% of rows from the trailing 2%\n",
    "split_index = int(len(df) * 0.98)\n",
    "\n",
    "# Leading rows form the training set; trailing rows are held out for validation\n",
    "train_df, validation_df = df.iloc[:split_index], df.iloc[split_index:]\n",
    "\n",
    "# Report the size of each partition\n",
    "print(f\"Training set size: {len(train_df)}\")\n",
    "print(f\"Validation set size: {len(validation_df)}\")\n",
    "\n",
    "# Persist each partition without the index column\n",
    "train_df.to_csv(\"FalconData_train2.csv\", index=False)\n",
    "validation_df.to_csv(\"FalconData_validation2.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "55d929c5-c198-4a91-b31d-65dd83fa00d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of duplicate rows: 4\n",
      " Text\n",
      "522 Name:\n",
      "11745 Description.\\nReviews\\nThere are no reviews yet.\n",
      "17605 Description.\\nReviews\\nThere are no reviews yet.\n",
      "42547 !\\n\n"
     ]
    }
   ],
   "source": [
    "# Reload the training split that was just written to disk\n",
    "df1 = pd.read_csv(\"FalconData_train2.csv\")\n",
    "\n",
    "# Rows that repeat an earlier row (first occurrences are not flagged)\n",
    "duplicate_rows1 = df1.loc[df1.duplicated()]\n",
    "\n",
    "# Report how many repeats there are, then show them\n",
    "print(f\"Number of duplicate rows: {len(duplicate_rows1)}\")\n",
    "print(duplicate_rows1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "3cc404d9-e85e-48ff-aa34-750ebe3e3d3c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Text
0[…]\\nM&S bank […]\\nLowest unsecured loan rate...
1JavaScript seems to be disabled in your browse...
2CMTech has designed a game to foster social in...
3A Storyteller's Point of View\\nMy\\nWriting\\nLe...
4mspu.us was registered 1 decade 3 years ago. I...
\n", "
" ], "text/plain": [ " Text\n", "0 […]\\nM&S bank […]\\nLowest unsecured loan rate...\n", "1 JavaScript seems to be disabled in your browse...\n", "2 CMTech has designed a game to foster social in...\n", "3 A Storyteller's Point of View\\nMy\\nWriting\\nLe...\n", "4 mspu.us was registered 1 decade 3 years ago. I..." ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.head()" ] }, { "cell_type": "code", "execution_count": 27, "id": "641c606f-6f7f-4097-a8de-a9f6be0047b1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Text
48995A Chenango County man was charged Wednesday wi...
489962-Tone Black Personalized Embroidered One Init...
48997NARAL Pro-Choice America PAC Endorses Colleen ...
48998Posts Tagged by Thomas Paine\\nAEI Hosts Peter ...
48999Pantry feeds families in need\\n- Details\\n- Ca...
\n", "
" ], "text/plain": [ " Text\n", "48995 A Chenango County man was charged Wednesday wi...\n", "48996 2-Tone Black Personalized Embroidered One Init...\n", "48997 NARAL Pro-Choice America PAC Endorses Colleen ...\n", "48998 Posts Tagged by Thomas Paine\\nAEI Hosts Peter ...\n", "48999 Pantry feeds families in need\\n- Details\\n- Ca..." ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.tail()" ] }, { "cell_type": "code", "execution_count": null, "id": "b8f7dbf6-5d74-4f8f-85d0-e890a5b8d152", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }