nsthorat-lilac committed on
Commit
be17f2e
·
1 Parent(s): 1648180

Upload data/lilac.yml with huggingface_hub

Browse files
Files changed (1) hide show
  1. data/lilac.yml +27 -0
data/lilac.yml CHANGED
@@ -11,6 +11,10 @@ datasets:
11
  embeddings:
12
  - path: premise
13
  embedding: gte-small
 
 
 
 
14
  signals:
15
  - path: premise
16
  signal:
@@ -25,6 +29,7 @@ datasets:
25
  ui:
26
  media_paths:
27
  - premise
 
28
  - namespace: local
29
  name: glue_ax
30
  source:
@@ -66,6 +71,7 @@ datasets:
66
  ui:
67
  media_paths:
68
  - hypothesis
 
69
  - namespace: local
70
  name: imdb3
71
  source:
@@ -75,6 +81,7 @@ datasets:
75
  ui:
76
  media_paths:
77
  - text
 
78
  - namespace: local
79
  name: imdb
80
  source:
@@ -94,6 +101,7 @@ datasets:
94
  ui:
95
  media_paths:
96
  - text
 
97
  - namespace: local
98
  name: imdb2
99
  source:
@@ -103,6 +111,7 @@ datasets:
103
  ui:
104
  media_paths:
105
  - text
 
106
  - namespace: lilac
107
  name: OpenOrca-100k
108
  source:
@@ -235,11 +244,15 @@ datasets:
235
  - path: response
236
  signal:
237
  signal_name: text_statistics
 
 
 
238
  settings:
239
  ui:
240
  media_paths:
241
  - question
242
  - response
 
243
  - namespace: local
244
  name: the_movies_dataset
245
  source:
@@ -251,6 +264,7 @@ datasets:
251
  ui:
252
  media_paths:
253
  - overview
 
254
  - namespace: local
255
  name: glue_ax_parquet
256
  source:
@@ -261,6 +275,7 @@ datasets:
261
  ui:
262
  media_paths:
263
  - premise
 
264
  - namespace: lilac
265
  name: mmlu_professional_law
266
  source:
@@ -425,6 +440,7 @@ datasets:
425
  - question
426
  - - choices
427
  - '*'
 
428
  preferred_embedding: gte-small
429
  - namespace: local
430
  name: deepset-prompt-inj
@@ -438,6 +454,7 @@ datasets:
438
  ui:
439
  media_paths:
440
  - text
 
441
  - namespace: local
442
  name: jasper-prompt-inj
443
  source:
@@ -450,6 +467,7 @@ datasets:
450
  ui:
451
  media_paths:
452
  - text
 
453
  - namespace: local
454
  name: mosaic-chat-v2
455
  source:
@@ -550,6 +568,7 @@ datasets:
550
  media_paths:
551
  - prompt
552
  - response
 
553
  preferred_embedding: gte-small
554
  - namespace: local
555
  name: databricks-dolly-15k-curated-en
@@ -564,6 +583,8 @@ datasets:
564
  - value
565
  - '*'
566
  embedding: gte-small
 
 
567
  signals:
568
  - path: original-instruction
569
  signal:
@@ -793,6 +814,9 @@ datasets:
793
  - '*'
794
  signal:
795
  signal_name: text_statistics
 
 
 
796
  settings:
797
  ui:
798
  media_paths:
@@ -808,6 +832,7 @@ datasets:
808
  - - new-response
809
  - value
810
  - '*'
 
811
  preferred_embedding: gte-small
812
  - namespace: local
813
  name: open-asssistant-conversations
@@ -888,6 +913,7 @@ datasets:
888
  ui:
889
  media_paths:
890
  - text
 
891
  preferred_embedding: gte-small
892
  - namespace: local
893
  name: enron-emails
@@ -964,4 +990,5 @@ datasets:
964
  ui:
965
  media_paths:
966
  - text
 
967
  preferred_embedding: gte-small
 
11
  embeddings:
12
  - path: premise
13
  embedding: gte-small
14
+ - path: premise
15
+ embedding: gte-base
16
+ - path: hypothesis
17
+ embedding: gte-small
18
  signals:
19
  - path: premise
20
  signal:
 
29
  ui:
30
  media_paths:
31
  - premise
32
+ markdown_paths: []
33
  - namespace: local
34
  name: glue_ax
35
  source:
 
71
  ui:
72
  media_paths:
73
  - hypothesis
74
+ markdown_paths: []
75
  - namespace: local
76
  name: imdb3
77
  source:
 
81
  ui:
82
  media_paths:
83
  - text
84
+ markdown_paths: []
85
  - namespace: local
86
  name: imdb
87
  source:
 
101
  ui:
102
  media_paths:
103
  - text
104
+ markdown_paths: []
105
  - namespace: local
106
  name: imdb2
107
  source:
 
111
  ui:
112
  media_paths:
113
  - text
114
+ markdown_paths: []
115
  - namespace: lilac
116
  name: OpenOrca-100k
117
  source:
 
244
  - path: response
245
  signal:
246
  signal_name: text_statistics
247
+ - path: system_prompt
248
+ signal:
249
+ signal_name: pii
250
  settings:
251
  ui:
252
  media_paths:
253
  - question
254
  - response
255
+ markdown_paths: []
256
  - namespace: local
257
  name: the_movies_dataset
258
  source:
 
264
  ui:
265
  media_paths:
266
  - overview
267
+ markdown_paths: []
268
  - namespace: local
269
  name: glue_ax_parquet
270
  source:
 
275
  ui:
276
  media_paths:
277
  - premise
278
+ markdown_paths: []
279
  - namespace: lilac
280
  name: mmlu_professional_law
281
  source:
 
440
  - question
441
  - - choices
442
  - '*'
443
+ markdown_paths: []
444
  preferred_embedding: gte-small
445
  - namespace: local
446
  name: deepset-prompt-inj
 
454
  ui:
455
  media_paths:
456
  - text
457
+ markdown_paths: []
458
  - namespace: local
459
  name: jasper-prompt-inj
460
  source:
 
467
  ui:
468
  media_paths:
469
  - text
470
+ markdown_paths: []
471
  - namespace: local
472
  name: mosaic-chat-v2
473
  source:
 
568
  media_paths:
569
  - prompt
570
  - response
571
+ markdown_paths: []
572
  preferred_embedding: gte-small
573
  - namespace: local
574
  name: databricks-dolly-15k-curated-en
 
583
  - value
584
  - '*'
585
  embedding: gte-small
586
+ - path: original-instruction
587
+ embedding: gte-small
588
  signals:
589
  - path: original-instruction
590
  signal:
 
814
  - '*'
815
  signal:
816
  signal_name: text_statistics
817
+ - path: original-instruction
818
+ signal:
819
+ signal_name: spacy_ner
820
  settings:
821
  ui:
822
  media_paths:
 
832
  - - new-response
833
  - value
834
  - '*'
835
+ markdown_paths: []
836
  preferred_embedding: gte-small
837
  - namespace: local
838
  name: open-asssistant-conversations
 
913
  ui:
914
  media_paths:
915
  - text
916
+ markdown_paths: []
917
  preferred_embedding: gte-small
918
  - namespace: local
919
  name: enron-emails
 
990
  ui:
991
  media_paths:
992
  - text
993
+ markdown_paths: []
994
  preferred_embedding: gte-small