ksvmuralidhar committed on
Commit
9b397f8
·
verified ·
1 Parent(s): 710aeb7

Upload insert_into_db_sent_tran.ipynb

Browse files
Files changed (1) hide show
  1. insert_into_db_sent_tran.ipynb +514 -0
insert_into_db_sent_tran.ipynb ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "1aafbf18-de38-4fcf-8245-e2e9a584971f",
7
+ "metadata": {
8
+ "tags": []
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "# ! pip install pymilvus==2.3.4\n",
13
+ "# ! pip install pyarrow==12.0.0\n",
14
+ "# !pip install -U sentence-transformers"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "id": "f1d8f101-f51b-4a50-b150-86e87c50c453",
21
+ "metadata": {
22
+ "tags": []
23
+ },
24
+ "outputs": [],
25
+ "source": [
26
+ "import numpy as np\n",
27
+ "import tensorflow as tf\n",
28
+ "from tqdm import tqdm\n",
29
+ "from dotenv import load_dotenv\n",
30
+ "import os\n",
31
+ "import pandas as pd\n",
32
+ "from pymilvus import connections, utility\n",
33
+ "from pymilvus import Collection, DataType, FieldSchema, CollectionSchema\n",
34
+ "import multiprocessing\n",
35
+ "from sentence_transformers import SentenceTransformer"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "4ad4e3ac-9685-4f12-8043-5fbcc373d3e1",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "tf.config.list_physical_devices('GPU')"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 3,
51
+ "id": "da71d832-b8a7-452b-b736-538a3c069b54",
52
+ "metadata": {
53
+ "tags": []
54
+ },
55
+ "outputs": [
56
+ {
57
+ "data": {
58
+ "text/html": [
59
+ "<div>\n",
60
+ "<style scoped>\n",
61
+ " .dataframe tbody tr th:only-of-type {\n",
62
+ " vertical-align: middle;\n",
63
+ " }\n",
64
+ "\n",
65
+ " .dataframe tbody tr th {\n",
66
+ " vertical-align: top;\n",
67
+ " }\n",
68
+ "\n",
69
+ " .dataframe thead th {\n",
70
+ " text-align: right;\n",
71
+ " }\n",
72
+ "</style>\n",
73
+ "<table border=\"1\" class=\"dataframe\">\n",
74
+ " <thead>\n",
75
+ " <tr style=\"text-align: right;\">\n",
76
+ " <th></th>\n",
77
+ " <th>index</th>\n",
78
+ " <th>category</th>\n",
79
+ " <th>short_description</th>\n",
80
+ " </tr>\n",
81
+ " </thead>\n",
82
+ " <tbody>\n",
83
+ " <tr>\n",
84
+ " <th>0</th>\n",
85
+ " <td>0</td>\n",
86
+ " <td>SCIENCE</td>\n",
87
+ " <td>A closer look at water-splitting's solar fuel ...</td>\n",
88
+ " </tr>\n",
89
+ " <tr>\n",
90
+ " <th>1</th>\n",
91
+ " <td>1</td>\n",
92
+ " <td>SCIENCE</td>\n",
93
+ " <td>An irresistible scent makes locusts swarm, stu...</td>\n",
94
+ " </tr>\n",
95
+ " <tr>\n",
96
+ " <th>2</th>\n",
97
+ " <td>2</td>\n",
98
+ " <td>SCIENCE</td>\n",
99
+ " <td>Artificial intelligence warning: AI will know ...</td>\n",
100
+ " </tr>\n",
101
+ " <tr>\n",
102
+ " <th>3</th>\n",
103
+ " <td>3</td>\n",
104
+ " <td>SCIENCE</td>\n",
105
+ " <td>Glaciers Could Have Sculpted Mars Valleys: Study</td>\n",
106
+ " </tr>\n",
107
+ " <tr>\n",
108
+ " <th>4</th>\n",
109
+ " <td>4</td>\n",
110
+ " <td>SCIENCE</td>\n",
111
+ " <td>Perseid meteor shower 2020: What time and how ...</td>\n",
112
+ " </tr>\n",
113
+ " <tr>\n",
114
+ " <th>...</th>\n",
115
+ " <td>...</td>\n",
116
+ " <td>...</td>\n",
117
+ " <td>...</td>\n",
118
+ " </tr>\n",
119
+ " <tr>\n",
120
+ " <th>311171</th>\n",
121
+ " <td>311171</td>\n",
122
+ " <td>TECH</td>\n",
123
+ " <td>RIM CEO Thorsten Heins' 'Significant' Plans Fo...</td>\n",
124
+ " </tr>\n",
125
+ " <tr>\n",
126
+ " <th>311172</th>\n",
127
+ " <td>311172</td>\n",
128
+ " <td>SPORTS</td>\n",
129
+ " <td>Maria Sharapova Stunned By Victoria Azarenka I...</td>\n",
130
+ " </tr>\n",
131
+ " <tr>\n",
132
+ " <th>311173</th>\n",
133
+ " <td>311173</td>\n",
134
+ " <td>SPORTS</td>\n",
135
+ " <td>Giants Over Patriots, Jets Over Colts Among M...</td>\n",
136
+ " </tr>\n",
137
+ " <tr>\n",
138
+ " <th>311174</th>\n",
139
+ " <td>311174</td>\n",
140
+ " <td>SPORTS</td>\n",
141
+ " <td>Aldon Smith Arrested: 49ers Linebacker Busted ...</td>\n",
142
+ " </tr>\n",
143
+ " <tr>\n",
144
+ " <th>311175</th>\n",
145
+ " <td>311175</td>\n",
146
+ " <td>SPORTS</td>\n",
147
+ " <td>Dwight Howard Rips Teammates After Magic Loss ...</td>\n",
148
+ " </tr>\n",
149
+ " </tbody>\n",
150
+ "</table>\n",
151
+ "<p>311176 rows × 3 columns</p>\n",
152
+ "</div>"
153
+ ],
154
+ "text/plain": [
155
+ " index category short_description\n",
156
+ "0 0 SCIENCE A closer look at water-splitting's solar fuel ...\n",
157
+ "1 1 SCIENCE An irresistible scent makes locusts swarm, stu...\n",
158
+ "2 2 SCIENCE Artificial intelligence warning: AI will know ...\n",
159
+ "3 3 SCIENCE Glaciers Could Have Sculpted Mars Valleys: Study\n",
160
+ "4 4 SCIENCE Perseid meteor shower 2020: What time and how ...\n",
161
+ "... ... ... ...\n",
162
+ "311171 311171 TECH RIM CEO Thorsten Heins' 'Significant' Plans Fo...\n",
163
+ "311172 311172 SPORTS Maria Sharapova Stunned By Victoria Azarenka I...\n",
164
+ "311173 311173 SPORTS Giants Over Patriots, Jets Over Colts Among M...\n",
165
+ "311174 311174 SPORTS Aldon Smith Arrested: 49ers Linebacker Busted ...\n",
166
+ "311175 311175 SPORTS Dwight Howard Rips Teammates After Magic Loss ...\n",
167
+ "\n",
168
+ "[311176 rows x 3 columns]"
169
+ ]
170
+ },
171
+ "execution_count": 3,
172
+ "metadata": {},
173
+ "output_type": "execute_result"
174
+ }
175
+ ],
176
+ "source": [
177
+ "data = pd.read_csv('labelled_newscatcher_dataset.csv', sep=\";\", usecols=['title', 'topic'])\n",
178
+ "json_data=pd.read_json('News_Category_Dataset_v3.json', lines=True)\n",
179
+ "data.drop_duplicates(subset=['title'], inplace=True)\n",
180
+ "json_data.drop_duplicates(subset=['headline'], inplace=True)\n",
181
+ "json_data = json_data[['headline', 'category']].copy()\n",
182
+ "json_data.rename(columns={'headline': 'title'}, inplace=True)\n",
183
+ "data.rename(columns={'topic': 'category'}, inplace=True)\n",
184
+ "data = pd.concat([data, json_data], axis=0)\n",
185
+ "data.drop_duplicates(subset=['title'], inplace=True)\n",
186
+ "data.reset_index(drop=True, inplace=True)\n",
187
+ "data.reset_index(inplace=True)\n",
188
+ "data.rename(columns={'title': 'short_description'}, inplace=True)\n",
189
+ "data"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 4,
195
+ "id": "796f85b1-12dc-42cb-b431-65c88738b607",
196
+ "metadata": {},
197
+ "outputs": [
198
+ {
199
+ "data": {
200
+ "text/plain": [
201
+ "False"
202
+ ]
203
+ },
204
+ "execution_count": 4,
205
+ "metadata": {},
206
+ "output_type": "execute_result"
207
+ }
208
+ ],
209
+ "source": [
210
+ "any(data['short_description'].duplicated())"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": 5,
216
+ "id": "5f46251b-156a-4a72-ab89-6abb6d810006",
217
+ "metadata": {
218
+ "tags": []
219
+ },
220
+ "outputs": [],
221
+ "source": [
222
+ "data.to_csv('news_processed.csv', index=False)"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 7,
228
+ "id": "1463ea34-4447-464a-b0a3-da5e09892a09",
229
+ "metadata": {},
230
+ "outputs": [],
231
+ "source": [
232
+ "class TextVectorizer:\n",
233
+ " '''\n",
234
+ " sentence transformers to extract sentence embeddings\n",
235
+ " '''\n",
236
+ " def vectorize(self, x):\n",
237
+ " sent_model = SentenceTransformer('all-mpnet-base-v2')\n",
238
+ " sen_embeddings = sent_model.encode(x)\n",
239
+ " return sen_embeddings"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 8,
245
+ "id": "47a714f3-8948-470b-9caf-93ed2bbf4894",
246
+ "metadata": {
247
+ "tags": []
248
+ },
249
+ "outputs": [],
250
+ "source": [
251
+ "vectorizer = TextVectorizer()"
252
+ ]
253
+ },
254
+ {
255
+ "cell_type": "code",
256
+ "execution_count": 9,
257
+ "id": "8b1586e5-2923-4632-a3db-fd2364124d6f",
258
+ "metadata": {
259
+ "tags": []
260
+ },
261
+ "outputs": [
262
+ {
263
+ "data": {
264
+ "text/plain": [
265
+ "320"
266
+ ]
267
+ },
268
+ "execution_count": 9,
269
+ "metadata": {},
270
+ "output_type": "execute_result"
271
+ }
272
+ ],
273
+ "source": [
274
+ "# getting max length of article descriptions to be used for VARCHAR while defining schema\n",
275
+ "max_desc_len = max([len(s) for s in data['short_description']])\n",
276
+ "max_desc_len"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": 10,
282
+ "id": "debe0ef4-b877-495a-872e-47f720b758a9",
283
+ "metadata": {},
284
+ "outputs": [
285
+ {
286
+ "data": {
287
+ "text/plain": [
288
+ "14"
289
+ ]
290
+ },
291
+ "execution_count": 10,
292
+ "metadata": {},
293
+ "output_type": "execute_result"
294
+ }
295
+ ],
296
+ "source": [
297
+ "# getting max length of article categories to be used for VARCHAR while defining schema\n",
298
+ "max_cat_len = max([len(s) for s in data['category']])\n",
299
+ "max_cat_len"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": 11,
305
+ "id": "80489f00-e59f-46ab-a933-97145928176c",
306
+ "metadata": {
307
+ "tags": []
308
+ },
309
+ "outputs": [],
310
+ "source": [
311
+ "# Reading Milvus URI & API token from secrets.env\n",
312
+ "load_dotenv('secrets.env')\n",
313
+ "uri = os.environ.get(\"URI\")\n",
314
+ "token = os.environ.get(\"TOKEN\")"
315
+ ]
316
+ },
317
+ {
318
+ "cell_type": "code",
319
+ "execution_count": 12,
320
+ "id": "0bf69f22-e113-43a5-be81-77224cafd856",
321
+ "metadata": {
322
+ "tags": []
323
+ },
324
+ "outputs": [
325
+ {
326
+ "name": "stdout",
327
+ "output_type": "stream",
328
+ "text": [
329
+ "Connected to DB\n"
330
+ ]
331
+ }
332
+ ],
333
+ "source": [
334
+ "connections.connect(\"default\", uri=uri, token=token)\n",
335
+ "print(f\"Connected to DB\")"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "execution_count": 13,
341
+ "id": "8da06a3b-2005-4c02-a168-dc84bcde7064",
342
+ "metadata": {
343
+ "tags": []
344
+ },
345
+ "outputs": [],
346
+ "source": [
347
+ "collection_name = 'news_collection_sent_tran'\n",
348
+ "check_collection = utility.has_collection(collection_name)"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": 14,
354
+ "id": "33342612-1380-4d1a-a8e7-931476e07979",
355
+ "metadata": {
356
+ "tags": []
357
+ },
358
+ "outputs": [
359
+ {
360
+ "name": "stdout",
361
+ "output_type": "stream",
362
+ "text": [
363
+ "Droped Existing collection\n"
364
+ ]
365
+ }
366
+ ],
367
+ "source": [
368
+ "if check_collection:\n",
369
+ " drop_result = utility.drop_collection(collection_name)\n",
370
+ " print(\"Droped Existing collection\")"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": 15,
376
+ "id": "fc8ae048-d586-41e7-9678-75e1752c1693",
377
+ "metadata": {
378
+ "tags": []
379
+ },
380
+ "outputs": [
381
+ {
382
+ "name": "stdout",
383
+ "output_type": "stream",
384
+ "text": [
385
+ "Creating the collection\n",
386
+ "Schema: {'auto_id': False, 'description': 'collection of news articles', 'fields': [{'name': 'article_id', 'description': 'primary id', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'article_embed', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}, {'name': 'article_desc', 'description': 'short description of the article', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 370}}, {'name': 'article_category', 'description': 'category of the article', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 64}}]}\n",
387
+ "Success!\n"
388
+ ]
389
+ }
390
+ ],
391
+ "source": [
392
+ "# Creating collection schema\n",
393
+ "dim = 768 # embeddings dim\n",
394
+ "article_id = FieldSchema(name=\"article_id\", dtype=DataType.INT64, is_primary=True, description=\"primary id\") # primary key\n",
395
+ "article_embed_field = FieldSchema(name=\"article_embed\", dtype=DataType.FLOAT_VECTOR, dim=dim) # description embeddings\n",
396
+ "article_desc = FieldSchema(name=\"article_desc\", dtype=DataType.VARCHAR, max_length=(max_desc_len + 50), # using max_desc_len to specify VARCHAR len \n",
397
+ " is_primary=False, description=\"short description of the article\") # short description of article\n",
398
+ "article_cat = FieldSchema(name=\"article_category\", dtype=DataType.VARCHAR, max_length=(max_cat_len + 50), # using max_cat_len to specify VARCHAR len \n",
399
+ " is_primary=False, description=\"category of the article\") # category of article\n",
400
+ "schema = CollectionSchema(fields=[article_id, article_embed_field, article_desc, article_cat], \n",
401
+ " auto_id=False, description=\"collection of news articles\")\n",
402
+ "print(f\"Creating the collection\")\n",
403
+ "collection = Collection(name=collection_name, schema=schema)\n",
404
+ "print(f\"Schema: {schema}\")\n",
405
+ "print(\"Success!\")"
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 16,
411
+ "id": "cca82380-98f6-4c44-aac6-86d4ae3484d0",
412
+ "metadata": {},
413
+ "outputs": [
414
+ {
415
+ "name": "stdout",
416
+ "output_type": "stream",
417
+ "text": [
418
+ "[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 102000, 103000, 104000, 105000, 106000, 107000, 108000, 109000, 110000, 111000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 119000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 139000, 140000, 141000, 142000, 143000, 144000, 145000, 146000, 147000, 148000, 149000, 150000, 151000, 152000, 153000, 154000, 155000, 156000, 157000, 158000, 159000, 160000, 161000, 162000, 163000, 164000, 165000, 166000, 167000, 168000, 169000, 170000, 171000, 172000, 173000, 174000, 175000, 176000, 177000, 178000, 179000, 180000, 181000, 182000, 183000, 184000, 185000, 186000, 187000, 188000, 189000, 190000, 191000, 192000, 193000, 194000, 195000, 196000, 197000, 198000, 199000, 200000, 201000, 202000, 203000, 204000, 205000, 206000, 207000, 208000, 209000, 210000, 211000, 212000, 213000, 214000, 215000, 216000, 217000, 218000, 219000, 220000, 221000, 222000, 223000, 224000, 225000, 226000, 227000, 228000, 229000, 230000, 231000, 232000, 233000, 234000, 235000, 236000, 237000, 238000, 239000, 240000, 241000, 242000, 243000, 244000, 245000, 246000, 247000, 248000, 249000, 250000, 251000, 252000, 253000, 254000, 255000, 256000, 257000, 258000, 259000, 260000, 261000, 262000, 
263000, 264000, 265000, 266000, 267000, 268000, 269000, 270000, 271000, 272000, 273000, 274000, 275000, 276000, 277000, 278000, 279000, 280000, 281000, 282000, 283000, 284000, 285000, 286000, 287000, 288000, 289000, 290000, 291000, 292000, 293000, 294000, 295000, 296000, 297000, 298000, 299000, 300000, 301000, 302000, 303000, 304000, 305000, 306000, 307000, 308000, 309000, 310000, 311000, 311176]\n"
419
+ ]
420
+ }
421
+ ],
422
+ "source": [
423
+ "cuts = [*range(0, len(data), 1000)]\n",
424
+ "cuts.append(len(data))\n",
425
+ "print(cuts)"
426
+ ]
427
+ },
428
+ {
429
+ "cell_type": "code",
430
+ "execution_count": 17,
431
+ "id": "e28b2351-e333-44e8-bac4-96686abda113",
432
+ "metadata": {},
433
+ "outputs": [
434
+ {
435
+ "data": {
436
+ "text/plain": [
437
+ "8"
438
+ ]
439
+ },
440
+ "execution_count": 17,
441
+ "metadata": {},
442
+ "output_type": "execute_result"
443
+ }
444
+ ],
445
+ "source": [
446
+ "multiprocessing.cpu_count()"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": null,
452
+ "id": "066c67ac-01a6-4151-8e85-5869ddce1c0a",
453
+ "metadata": {},
454
+ "outputs": [],
455
+ "source": [
456
+ "article_id = []\n",
457
+ "article_desc = []\n",
458
+ "article_embed = []\n",
459
+ "article_cat = []\n",
460
+ "try:\n",
461
+ " for i in tqdm(range(len(cuts)-1)):\n",
462
+ " df = data.iloc[cuts[i]: cuts[i+1]].copy()\n",
463
+ " article_id = [*df['index']]\n",
464
+ " article_desc = [*df['short_description']]\n",
465
+ " article_cat = [*df['category']]\n",
466
+ " results = []\n",
467
+ " article_embed = vectorizer.vectorize(article_desc)\n",
468
+ " docs = [article_id, article_embed, article_desc, article_cat]\n",
469
+ " ins_resp = collection.insert(docs)\n",
470
+ " print(ins_resp)\n",
471
+ " article_id = []\n",
472
+ " article_desc = []\n",
473
+ " article_embed = []\n",
474
+ " article_cat = []\n",
475
+ " if i == 0:\n",
476
+ " index_params = {\"index_type\": \"AUTOINDEX\", \"metric_type\": \"L2\", \"params\": {}} \n",
477
+ " collection.create_index(field_name='article_embed', index_params=index_params)\n",
478
+ " collection = Collection(name=collection_name)\n",
479
+ " collection.load()\n",
480
+ "except:\n",
481
+ " raise"
482
+ ]
483
+ },
484
+ {
485
+ "cell_type": "code",
486
+ "execution_count": null,
487
+ "id": "d50177fa-fd0c-48ad-bc9b-a7bdc826a628",
488
+ "metadata": {},
489
+ "outputs": [],
490
+ "source": []
491
+ }
492
+ ],
493
+ "metadata": {
494
+ "kernelspec": {
495
+ "display_name": "Python (tf_gpu)",
496
+ "language": "python",
497
+ "name": "tf_gpu"
498
+ },
499
+ "language_info": {
500
+ "codemirror_mode": {
501
+ "name": "ipython",
502
+ "version": 3
503
+ },
504
+ "file_extension": ".py",
505
+ "mimetype": "text/x-python",
506
+ "name": "python",
507
+ "nbconvert_exporter": "python",
508
+ "pygments_lexer": "ipython3",
509
+ "version": "3.9.18"
510
+ }
511
+ },
512
+ "nbformat": 4,
513
+ "nbformat_minor": 5
514
+ }