{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"id": "460d90da-b986-4c1c-8a66-eab144b0ba8d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Started Fetching Data\n",
"Failed to fetch data, retrying. Attempt 1/10\n",
"Failed to fetch data, retrying. Attempt 1/10\n",
"Fetched data for all the Pages.\n"
]
}
],
"source": [
"import requests\n",
"import time\n",
"\n",
"import random\n",
"pages = [\n",
" random.randint(1, 968000015)\n",
" for _ in range(500)\n",
" ]\n",
"# print(pages)\n",
"\n",
"base_url = \"https://datasets-server.huggingface.co/rows\"\n",
"params = {\n",
" \"dataset\": \"tiiuae/falcon-refinedweb\",\n",
" \"config\": \"default\",\n",
" \"split\": \"train\",\n",
" }\n",
"# response = requests.get(base_url, params=params)\n",
"# response.raise_for_status()\n",
"# for row in response.json()[\"rows\"]:\n",
"# content = row[\"row\"][\"content\"]\n",
"num_rows_per_page = 100\n",
"retry_limit = 10\n",
"retry_delay = 5\n",
"Falcon = []\n",
"\n",
"print('Started Fetching Data')\n",
"def fetch_data_for_page(page):\n",
" params[\"offset\"] = page\n",
" params[\"limit\"] = num_rows_per_page\n",
" attempt = 0\n",
" while attempt < retry_limit:\n",
" try:\n",
" response = requests.get(base_url, params=params)\n",
" response.raise_for_status() # This will raise an HTTPError if the HTTP request returned an unsuccessful status code\n",
" for row in response.json()[\"rows\"]:\n",
" content = row[\"row\"][\"content\"]\n",
" Falcon.append(content)\n",
" len(Falcon)\n",
" #print(f\"Fetched data for all the Pages.\")\n",
" break\n",
" except requests.exceptions.HTTPError as e:\n",
" attempt += 1\n",
" print(\n",
" f\"Failed to fetch data, retrying. Attempt {attempt}/{retry_limit}\"\n",
" )\n",
" if attempt < retry_limit:\n",
" time.sleep(retry_delay) # Wait before the next retry\n",
" else:\n",
" print(\n",
" \"Maximum retry limit reached. Unable to fetch data.\"\n",
" )\n",
" raise\n",
"\n",
"for page in pages:\n",
" fetch_data_for_page(page)\n",
"\n",
"print(f\"Fetched data for all the Pages.\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f8f3baf1-5480-450b-a456-174a5c114d3e",
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"\n",
"# Open the CSV file for writing\n",
"with open(\"FalconData2.csv\", \"w\", newline=\"\") as csvfile:\n",
" # Create a CSV writer object\n",
" writer = csv.writer(csvfile)\n",
"\n",
" # Write the header row\n",
" writer.writerow([\"Text\"])\n",
"\n",
" # Write each element in the list as a row in the CSV file\n",
" for element in Falcon:\n",
" writer.writerow([element])\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "ea47c936-2c2b-4414-ba57-74fb6827ec0a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of duplicate rows: 5\n",
" Text\n",
"522 Name:\n",
"11746 Description.\\nReviews\\nThere are no reviews yet.\n",
"17606 Description.\\nReviews\\nThere are no reviews yet.\n",
"30436 NaN\n",
"42549 !\\n\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Read the CSV file into a pandas DataFrame\n",
"df = pd.read_csv(\"FalconData2.csv\")\n",
"\n",
"# Check for duplicate rows\n",
"duplicate_rows = df[df.duplicated()]\n",
"\n",
"# Print the number of duplicate rows\n",
"print(f\"Number of duplicate rows: {len(duplicate_rows)}\")\n",
"\n",
"# Print the duplicate rows\n",
"print(duplicate_rows)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "f4178cd6-747f-4e05-a9bf-17b97f959e06",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Text | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" […]\\nM&S bank […]\\nLowest unsecured loan rate... | \n",
"
\n",
" \n",
" 1 | \n",
" JavaScript seems to be disabled in your browse... | \n",
"
\n",
" \n",
" 2 | \n",
" CMTech has designed a game to foster social in... | \n",
"
\n",
" \n",
" 3 | \n",
" A Storyteller's Point of View\\nMy\\nWriting\\nLe... | \n",
"
\n",
" \n",
" 4 | \n",
" mspu.us was registered 1 decade 3 years ago. I... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Text\n",
"0 […]\\nM&S bank […]\\nLowest unsecured loan rate...\n",
"1 JavaScript seems to be disabled in your browse...\n",
"2 CMTech has designed a game to foster social in...\n",
"3 A Storyteller's Point of View\\nMy\\nWriting\\nLe...\n",
"4 mspu.us was registered 1 decade 3 years ago. I..."
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "264548c1-4cf4-441f-a433-2f5d57861dc4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Text | \n",
"
\n",
" \n",
" \n",
" \n",
" 49995 | \n",
" Alumni in Action: Grace Heyne Lybrand\\nWhen Gr... | \n",
"
\n",
" \n",
" 49996 | \n",
" This.\\n51.351813 -105.220438\\n12 replies on “L... | \n",
"
\n",
" \n",
" 49997 | \n",
" VIDEO 1: Panel discussion with John Nichols, a... | \n",
"
\n",
" \n",
" 49998 | \n",
" The Prototype DA-2A made its first flight on M... | \n",
"
\n",
" \n",
" 49999 | \n",
" default search action\\nBibTeX record journals/... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Text\n",
"49995 Alumni in Action: Grace Heyne Lybrand\\nWhen Gr...\n",
"49996 This.\\n51.351813 -105.220438\\n12 replies on “L...\n",
"49997 VIDEO 1: Panel discussion with John Nichols, a...\n",
"49998 The Prototype DA-2A made its first flight on M...\n",
"49999 default search action\\nBibTeX record journals/..."
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "3f215b09-8050-4477-860c-d3ed0a19f45d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of Words:\n",
"0 65\n",
"1 79\n",
"2 287\n",
"3 302\n",
"4 130\n",
" ... \n",
"49995 64\n",
"49996 325\n",
"49997 58\n",
"49998 623\n",
"49999 67\n",
"Name: Text, Length: 50000, dtype: int64\n",
"Smallest Row:\n",
"Text This\n",
"Name: 270, dtype: object\n",
"\n",
"Largest Row:\n",
"Text MAMMALS\\n400. Abu Jafar, M.Z., and C. Hays-Sha...\n",
"Name: 33020, dtype: object\n"
]
}
],
"source": [
"# Calculate the word count for each row without storing it as a column\n",
"word_counts = df['Text'].apply(lambda x: len(str(x).split()))\n",
"\n",
"\n",
"print(\"Number of Words:\")\n",
"print(word_counts)\n",
"\n",
"# print(\"Smallest Count\")\n",
"# print(word_counts.min())\n",
"\n",
"# print(\"Largest Count\")\n",
"# print(word_counts.max())\n",
"\n",
"# Find the row with the smallest word count\n",
"smallest_row = df.loc[word_counts.idxmin()]\n",
"\n",
"# Find the row with the largest word count\n",
"largest_row = df.loc[word_counts.idxmax()]\n",
"\n",
"# Display the smallest and largest rows\n",
"print(\"Smallest Row:\")\n",
"print(smallest_row)\n",
"\n",
"print(\"\\nLargest Row:\")\n",
"print(largest_row)\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "be5a87a8-cfee-4f63-992e-8fa1d4a5cdbb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Text NaN\n",
"Name: 30436, dtype: object"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_row=30436\n",
"specific_row = df.iloc[target_row]\n",
"specific_row"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "e97d9e18-eaa0-4a1b-96ab-c89a0f4c738d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Text The old wireline Bell telephone system was bui...\n",
"Name: 19995, dtype: object\n"
]
}
],
"source": [
"print(specific_row)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "940ef35f-7517-403d-9f42-73760182dcaa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Text The old wireline Bell telephone system was bui...\n"
]
}
],
"source": [
"print(specific_row.to_string())"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "915ac669-718f-47f5-b175-a5f928b407db",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"57\n"
]
}
],
"source": [
"print(len(specific_row.to_string()))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "ab5ee254-9ba7-496b-97c7-3b6185c21971",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training set size: 49000\n",
"Validation set size: 1000\n"
]
}
],
"source": [
"# import pandas as pd\n",
"\n",
"# # Load the dataset\n",
"# df = pd.read_csv(\"FalconData2.csv\")\n",
"\n",
"# # Calculate the index to split the data at the last 10%\n",
"# split_index = int(len(df) * 0.980)\n",
"\n",
"# # Split the data into training and validation sets\n",
"# train_df = df.iloc[:split_index] # First 90% for training\n",
"# validation_df = df.iloc[split_index:] # Last 10% for validation\n",
"\n",
"# # Display the sizes of the training and validation sets\n",
"# print(f\"Training set size: {len(train_df)}\")\n",
"# print(f\"Validation set size: {len(validation_df)}\")\n",
"\n",
"# # Optionally, save the datasets to new CSV files\n",
"# train_df.to_csv(\"FalconData_train2.csv\", index=False)\n",
"# validation_df.to_csv(\"FalconData_validation2.csv\", index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "7a16fb10-40cd-4668-b363-57ca64819ad3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of rows removed due to NaN values: 2\n",
"Training set size: 48998\n",
"Validation set size: 1000\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Load the dataset\n",
"df = pd.read_csv(\"FalconData2.csv\")\n",
"\n",
"# Check for NaN values and remove rows with NaN values\n",
"# df = df.dropna()\n",
"original_length = len(df)\n",
"\n",
"df = df.dropna()\n",
"\n",
"removed_rows = original_length - len(df)\n",
"print(f\"Number of rows removed due to NaN values: {removed_rows}\")\n",
"\n",
"# Calculate the index to split the data at the last 2%\n",
"split_index = int(len(df) * 0.98)\n",
"\n",
"# Split the data into training and validation sets\n",
"train_df = df.iloc[:split_index] # First 98% for training\n",
"validation_df = df.iloc[split_index:] # Last 2% for validation\n",
"\n",
"# Display the sizes of the training and validation sets\n",
"print(f\"Training set size: {len(train_df)}\")\n",
"print(f\"Validation set size: {len(validation_df)}\")\n",
"\n",
"# Save the datasets to new CSV files\n",
"train_df.to_csv(\"FalconData_train2.csv\", index=False)\n",
"validation_df.to_csv(\"FalconData_validation2.csv\", index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "55d929c5-c198-4a91-b31d-65dd83fa00d2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of duplicate rows: 4\n",
" Text\n",
"522 Name:\n",
"11745 Description.\\nReviews\\nThere are no reviews yet.\n",
"17605 Description.\\nReviews\\nThere are no reviews yet.\n",
"42547 !\\n\n"
]
}
],
"source": [
"# Read the CSV file into a pandas DataFrame\n",
"df1 = pd.read_csv(\"FalconData_train2.csv\")\n",
"\n",
"# Check for duplicate rows\n",
"duplicate_rows1 = df1[df1.duplicated()]\n",
"\n",
"# Print the number of duplicate rows\n",
"print(f\"Number of duplicate rows: {len(duplicate_rows1)}\")\n",
"\n",
"# Print the duplicate rows\n",
"print(duplicate_rows1)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "3cc404d9-e85e-48ff-aa34-750ebe3e3d3c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Text | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" […]\\nM&S bank […]\\nLowest unsecured loan rate... | \n",
"
\n",
" \n",
" 1 | \n",
" JavaScript seems to be disabled in your browse... | \n",
"
\n",
" \n",
" 2 | \n",
" CMTech has designed a game to foster social in... | \n",
"
\n",
" \n",
" 3 | \n",
" A Storyteller's Point of View\\nMy\\nWriting\\nLe... | \n",
"
\n",
" \n",
" 4 | \n",
" mspu.us was registered 1 decade 3 years ago. I... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Text\n",
"0 […]\\nM&S bank […]\\nLowest unsecured loan rate...\n",
"1 JavaScript seems to be disabled in your browse...\n",
"2 CMTech has designed a game to foster social in...\n",
"3 A Storyteller's Point of View\\nMy\\nWriting\\nLe...\n",
"4 mspu.us was registered 1 decade 3 years ago. I..."
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.head()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "641c606f-6f7f-4097-a8de-a9f6be0047b1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Text | \n",
"
\n",
" \n",
" \n",
" \n",
" 48995 | \n",
" A Chenango County man was charged Wednesday wi... | \n",
"
\n",
" \n",
" 48996 | \n",
" 2-Tone Black Personalized Embroidered One Init... | \n",
"
\n",
" \n",
" 48997 | \n",
" NARAL Pro-Choice America PAC Endorses Colleen ... | \n",
"
\n",
" \n",
" 48998 | \n",
" Posts Tagged by Thomas Paine\\nAEI Hosts Peter ... | \n",
"
\n",
" \n",
" 48999 | \n",
" Pantry feeds families in need\\n- Details\\n- Ca... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Text\n",
"48995 A Chenango County man was charged Wednesday wi...\n",
"48996 2-Tone Black Personalized Embroidered One Init...\n",
"48997 NARAL Pro-Choice America PAC Endorses Colleen ...\n",
"48998 Posts Tagged by Thomas Paine\\nAEI Hosts Peter ...\n",
"48999 Pantry feeds families in need\\n- Details\\n- Ca..."
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8f7dbf6-5d74-4f8f-85d0-e890a5b8d152",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}