TroyDoesAI committed
Commit 384475e · verified · 1 Parent(s): 0d68b58

Some very light instruct tuning for context obedience, but mostly pretraining data


- The model is showing signs of deeply understanding what the user is saying; let's see if it continues to improve as I grok this pretraining data to convergence over the next 2 weeks lol!

config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": ".\\BlackSheep-MoE",
+ "_name_or_path": ".\\BlackSheep",
  "architectures": [
    "MixtralForCausalLM"
  ],
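The renamed `_name_or_path` is just local bookkeeping; the checkpoint still loads as a stock Mixtral. A minimal sketch with transformers, assuming you point it at your own clone of the repo (".\\BlackSheep" is the author's local Windows path recorded in config.json):

# Minimal sketch: load the checkpoint with transformers.
# ".\\BlackSheep" is the author's local Windows path from config.json;
# replace it with wherever you cloned this repo.
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained(".\\BlackSheep")
assert config.architectures == ["MixtralForCausalLM"]  # unchanged by this commit

model = AutoModelForCausalLM.from_pretrained(
    ".\\BlackSheep",
    torch_dtype="auto",   # use the dtype stored in the safetensors shards
    device_map="auto",    # shard across available devices (requires accelerate)
)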
model-00001-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:65ee3280167f66c7a9351cbbb8cd9e4b78cbc9f787ea0d4f267dad08ea3db45c
+ oid sha256:9feb3cb1f21af9f9f99a7a9cdbc743038df76a41a44e576f5f6f33b602b27484
  size 4991365712
model-00002-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cba13a57d985d1f0e73dbb113aee6571529da17cd10df1c23f54a8096b4444a7
+ oid sha256:7d10dd440267b2650ca251d69b08f08078e229d678ec5ff8e73c4b1f594fd725
  size 4995716272
model-00003-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4f25d6a0f49b75f419be5ae2b79e7ddf7dd4129b9ebd26f02929157173926324
+ oid sha256:f3e7aa064caf67c5c6955c3577ca756605dd1f3cb82f77bebbf183e65ac6d5c8
  size 4957942672
model-00004-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f9e8c75264900005fa93a33558fd04c9c385d0fadbe0372d87171c6a05a7b8fa
+ oid sha256:865cd498e87b6758323f9f1fc7fcde07563eeaa09f6f5acd0bcf8ba3326d15b1
  size 4995704152
model-00005-of-00005.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ad8a108967cdebda6ad53213551169d5f555c349fbf5d6cbc691651f22a8e617
+ oid sha256:bfe1e19cfe278adf34eff4c60d891f1954bcd23bb68a85a794ade83e11fb0cf7
  size 2197808096
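Each shard entry above is a Git LFS pointer, not the weights themselves: per the spec referenced on the "version" line, "oid sha256:" is the SHA-256 digest of the real file and "size" is its byte count, so re-uploading the retrained shards changes only the oid lines. A small sketch for verifying a downloaded shard against its pointer (the filename and digest below are taken from this commit's first shard):

# Sketch: check a downloaded safetensors shard against its Git LFS pointer.
import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Hash in 1 MiB chunks so multi-GB shards don't have to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Expected oid for model-00001-of-00005.safetensors after this commit.
expected = "9feb3cb1f21af9f9f99a7a9cdbc743038df76a41a44e576f5f6f33b602b27484"
actual = sha256_of("model-00001-of-00005.safetensors")
print("OK" if actual == expected else f"MISMATCH: {actual}")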
traininglog.txt ADDED
@@ -0,0 +1,109 @@
+ Step: 4 {'loss': 0.6225, 'grad_norm': 0.5366921424865723, 'learning_rate': 3.9999553115423906e-05, 'epoch': 0.000268802752540186}
+ Step: 9 {'loss': 0.5018, 'grad_norm': 0.2231733202934265, 'learning_rate': 3.99991062308478e-05, 'epoch': 0.000537605505080372}
+ Step: 14 {'loss': 0.6071, 'grad_norm': 0.6181017756462097, 'learning_rate': 3.999865934627171e-05, 'epoch': 0.000806408257620558}
+ Step: 19 {'loss': 0.3221, 'grad_norm': 0.2883134186267853, 'learning_rate': 3.9998212461695615e-05, 'epoch': 0.001075211010160744}
+ Step: 24 {'loss': 0.6598, 'grad_norm': 0.6283852458000183, 'learning_rate': 3.999776557711951e-05, 'epoch': 0.00134401376270093}
+ Step: 29 {'loss': 0.3794, 'grad_norm': 0.7882901430130005, 'learning_rate': 3.9997318692543414e-05, 'epoch': 0.001612816515241116}
+ Step: 34 {'loss': 0.4322, 'grad_norm': 0.29714861512184143, 'learning_rate': 3.999687180796732e-05, 'epoch': 0.001881619267781302}
+ Step: 39 {'loss': 0.6519, 'grad_norm': 0.8262774348258972, 'learning_rate': 3.999642492339122e-05, 'epoch': 0.002150422020321488}
+ Step: 44 {'loss': 0.5566, 'grad_norm': 0.18315744400024414, 'learning_rate': 3.999597803881512e-05, 'epoch': 0.002419224772861674}
+ Step: 49 {'loss': 0.5166, 'grad_norm': 0.2871042788028717, 'learning_rate': 3.9995531154239026e-05, 'epoch': 0.00268802752540186}
+ Step: 54 {'loss': 0.4397, 'grad_norm': 0.3087766468524933, 'learning_rate': 3.999508426966292e-05, 'epoch': 0.0029568302779420462}
+ Step: 59 {'loss': 0.5999, 'grad_norm': 0.3319031596183777, 'learning_rate': 3.9994637385086825e-05, 'epoch': 0.003225633030482232}
+ Step: 64 {'loss': 0.4279, 'grad_norm': 0.152162566781044, 'learning_rate': 3.999419050051073e-05, 'epoch': 0.0034944357830224183}
+ Step: 69 {'loss': 0.4612, 'grad_norm': 0.4317966103553772, 'learning_rate': 3.999374361593463e-05, 'epoch': 0.003763238535562604}
+ Step: 74 {'loss': 0.702, 'grad_norm': 0.4360438287258148, 'learning_rate': 3.9993296731358534e-05, 'epoch': 0.00403204128810279}
+ Step: 79 {'loss': 0.3772, 'grad_norm': 0.2938617467880249, 'learning_rate': 3.999284984678243e-05, 'epoch': 0.004300844040642976}
+ Step: 84 {'loss': 0.5677, 'grad_norm': 0.41090044379234314, 'learning_rate': 3.9992402962206334e-05, 'epoch': 0.0045696467931831625}
+ Step: 89 {'loss': 0.2434, 'grad_norm': 0.2661038041114807, 'learning_rate': 3.9991956077630236e-05, 'epoch': 0.004838449545723348}
+ Step: 94 {'loss': 0.162, 'grad_norm': 0.31489041447639465, 'learning_rate': 3.999150919305414e-05, 'epoch': 0.005107252298263534}
+ Step: 99 {'loss': 0.323, 'grad_norm': 0.21471913158893585, 'learning_rate': 3.999106230847804e-05, 'epoch': 0.00537605505080372}
+ Step: 104 {'loss': 0.9207, 'grad_norm': 0.856233537197113, 'learning_rate': 3.9990615423901945e-05, 'epoch': 0.005644857803343907}
+ Step: 109 {'loss': 0.3481, 'grad_norm': 0.29812806844711304, 'learning_rate': 3.999016853932584e-05, 'epoch': 0.0059136605558840925}
+ Step: 114 {'loss': 0.6372, 'grad_norm': 0.4753149449825287, 'learning_rate': 3.9989721654749745e-05, 'epoch': 0.006182463308424278}
+ Step: 119 {'loss': 0.3034, 'grad_norm': 0.2815486490726471, 'learning_rate': 3.9989274770173654e-05, 'epoch': 0.006451266060964464}
+ Step: 124 {'loss': 0.5537, 'grad_norm': 0.601784884929657, 'learning_rate': 3.998882788559755e-05, 'epoch': 0.00672006881350465}
+ Step: 129 {'loss': 0.3721, 'grad_norm': 0.2789490520954132, 'learning_rate': 3.9988381001021454e-05, 'epoch': 0.006988871566044837}
+ Step: 134 {'loss': 0.5, 'grad_norm': 0.1338292360305786, 'learning_rate': 3.9987934116445356e-05, 'epoch': 0.0072576743185850225}
+ Step: 139 {'loss': 0.3479, 'grad_norm': 0.3812941312789917, 'learning_rate': 3.998748723186926e-05, 'epoch': 0.007526477071125208}
+ Step: 144 {'loss': 0.4069, 'grad_norm': 0.11460933834314346, 'learning_rate': 3.998704034729316e-05, 'epoch': 0.007795279823665394}
+ Step: 149 {'loss': 0.2768, 'grad_norm': 0.22373315691947937, 'learning_rate': 3.9986593462717065e-05, 'epoch': 0.00806408257620558}
+ Step: 154 {'loss': 0.3149, 'grad_norm': 0.2564243972301483, 'learning_rate': 3.998614657814096e-05, 'epoch': 0.008332885328745767}
+ Step: 159 {'loss': 0.5705, 'grad_norm': 0.29972872138023376, 'learning_rate': 3.9985699693564865e-05, 'epoch': 0.008601688081285952}
+ Step: 164 {'loss': 0.7367, 'grad_norm': 0.9685312509536743, 'learning_rate': 3.998525280898877e-05, 'epoch': 0.008870490833826138}
+ Step: 169 {'loss': 0.5446, 'grad_norm': 0.258772075176239, 'learning_rate': 3.998480592441267e-05, 'epoch': 0.009139293586366325}
+ Step: 174 {'loss': 0.3431, 'grad_norm': 0.7926813364028931, 'learning_rate': 3.9984359039836574e-05, 'epoch': 0.00940809633890651}
+ Step: 179 {'loss': 0.5838, 'grad_norm': 0.7501709461212158, 'learning_rate': 3.9983912155260476e-05, 'epoch': 0.009676899091446697}
+ Step: 184 {'loss': 0.2948, 'grad_norm': 0.19987852871418, 'learning_rate': 3.998346527068437e-05, 'epoch': 0.009945701843986882}
+ Step: 189 {'loss': 0.3696, 'grad_norm': 0.10254683345556259, 'learning_rate': 3.9983018386108276e-05, 'epoch': 0.010214504596527068}
+ Step: 194 {'loss': 0.7386, 'grad_norm': 0.42202311754226685, 'learning_rate': 3.998257150153218e-05, 'epoch': 0.010483307349067255}
+ Step: 199 {'loss': 0.4241, 'grad_norm': 0.2576633393764496, 'learning_rate': 3.998212461695608e-05, 'epoch': 0.01075211010160744}
+ Step: 204 {'loss': 0.5949, 'grad_norm': 0.437381386756897, 'learning_rate': 3.9981677732379985e-05, 'epoch': 0.011020912854147627}
+ Step: 209 {'loss': 0.3349, 'grad_norm': 0.20532897114753723, 'learning_rate': 1.9981278894742503e-05, 'epoch': 0.022578217395978928}
+ Step: 214 {'loss': 0.4569, 'grad_norm': 0.1352977603673935, 'learning_rate': 1.9980833154141132e-05, 'epoch': 0.023115794000645094}
+ Step: 219 {'loss': 0.5782, 'grad_norm': 0.12661641836166382, 'learning_rate': 1.9980387413539764e-05, 'epoch': 0.023653370605311256}
+ Step: 224 {'loss': 0.479, 'grad_norm': 0.22078381478786469, 'learning_rate': 1.9979941672938397e-05, 'epoch': 0.02419094720997742}
+ Step: 229 {'loss': 0.4898, 'grad_norm': 0.1355533003807068, 'learning_rate': 1.9979495932337026e-05, 'epoch': 0.024728523814643587}
+ Step: 234 {'loss': 0.7018, 'grad_norm': 0.16286955773830414, 'learning_rate': 1.997905019173566e-05, 'epoch': 0.025266100419309753}
+ Step: 239 {'loss': 0.4878, 'grad_norm': 0.2710496187210083, 'learning_rate': 1.9978604451134288e-05, 'epoch': 0.025803677023975916}
+ Step: 244 {'loss': 0.4079, 'grad_norm': 0.16539081931114197, 'learning_rate': 1.997815871053292e-05, 'epoch': 0.02634125362864208}
+ Step: 249 {'loss': 0.5864, 'grad_norm': 0.14566001296043396, 'learning_rate': 1.997771296993155e-05, 'epoch': 0.026878830233308247}
+ Step: 254 {'loss': 0.4005, 'grad_norm': 0.12252970039844513, 'learning_rate': 1.997726722933018e-05, 'epoch': 0.027416406837974413}
+ Step: 259 {'loss': 0.5404, 'grad_norm': 0.24383878707885742, 'learning_rate': 1.997682148872881e-05, 'epoch': 0.027953983442640575}
+ Step: 264 {'loss': 0.3172, 'grad_norm': 0.2093472182750702, 'learning_rate': 1.9976375748127443e-05, 'epoch': 0.02849156004730674}
+ Step: 269 {'loss': 0.3715, 'grad_norm': 0.19576534628868103, 'learning_rate': 1.9975930007526076e-05, 'epoch': 0.029029136651972907}
+ Step: 274 {'loss': 0.303, 'grad_norm': 0.22658678889274597, 'learning_rate': 1.9975484266924705e-05, 'epoch': 0.029566713256639073}
+ Step: 279 {'loss': 0.4399, 'grad_norm': 0.17113405466079712, 'learning_rate': 1.9975038526323334e-05, 'epoch': 0.030104289861305235}
+ Step: 284 {'loss': 0.6118, 'grad_norm': 0.3955773711204529, 'learning_rate': 1.9974592785721967e-05, 'epoch': 0.0306418664659714}
+ Step: 289 {'loss': 0.537, 'grad_norm': 0.18520204722881317, 'learning_rate': 1.99741470451206e-05, 'epoch': 0.031179443070637566}
+ Step: 294 {'loss': 0.5581, 'grad_norm': 0.31099817156791687, 'learning_rate': 1.9973701304519228e-05, 'epoch': 0.03171701967530373}
+ Step: 299 {'loss': 0.5116, 'grad_norm': 0.14776164293289185, 'learning_rate': 1.997325556391786e-05, 'epoch': 0.0322545962799699}
+ Step: 304 {'loss': 0.63, 'grad_norm': 0.23881855607032776, 'learning_rate': 1.9972809823316493e-05, 'epoch': 0.03279217288463606}
+ Step: 309 {'loss': 0.3402, 'grad_norm': 0.17691396176815033, 'learning_rate': 1.9972364082715122e-05, 'epoch': 0.03332974948930222}
+ Step: 314 {'loss': 0.5772, 'grad_norm': 0.14926782250404358, 'learning_rate': 1.997191834211375e-05, 'epoch': 0.03386732609396839}
+ Step: 319 {'loss': 0.4408, 'grad_norm': 0.16565901041030884, 'learning_rate': 1.9971472601512384e-05, 'epoch': 0.034404902698634554}
+ Step: 324 {'loss': 0.5823, 'grad_norm': 0.21418456733226776, 'learning_rate': 1.9971026860911013e-05, 'epoch': 0.03494247930330072}
+ Step: 329 {'loss': 0.48, 'grad_norm': 0.14608466625213623, 'learning_rate': 1.9970581120309646e-05, 'epoch': 0.035480055907966886}
+ Step: 334 {'loss': 0.4705, 'grad_norm': 0.26669827103614807, 'learning_rate': 1.9970135379708278e-05, 'epoch': 0.03601763251263305}
+ Step: 339 {'loss': 0.4378, 'grad_norm': 0.33481690287590027, 'learning_rate': 1.9969689639106907e-05, 'epoch': 0.03655520911729922}
+ Step: 344 {'loss': 0.6453, 'grad_norm': 0.2946885824203491, 'learning_rate': 1.996924389850554e-05, 'epoch': 0.03709278572196538}
+ Step: 349 {'loss': 0.5199, 'grad_norm': 0.4816878139972687, 'learning_rate': 1.996879815790417e-05, 'epoch': 0.03763036232663154}
+ Step: 354 {'loss': 0.5932, 'grad_norm': 0.11819624155759811, 'learning_rate': 1.99683524173028e-05, 'epoch': 0.03816793893129771}
+ Step: 359 {'loss': 0.6298, 'grad_norm': 0.460887610912323, 'learning_rate': 1.996790667670143e-05, 'epoch': 0.03870551553596387}
+ Step: 364 {'loss': 0.6517, 'grad_norm': 0.31503042578697205, 'learning_rate': 1.9967460936100063e-05, 'epoch': 0.03924309214063004}
+ Step: 369 {'loss': 0.5211, 'grad_norm': 0.1864064633846283, 'learning_rate': 1.9967015195498692e-05, 'epoch': 0.039780668745296205}
+ Step: 374 {'loss': 0.5993, 'grad_norm': 0.16560548543930054, 'learning_rate': 1.9966569454897325e-05, 'epoch': 0.04031824534996237}
+ Step: 379 {'loss': 0.6103, 'grad_norm': 0.2273532599210739, 'learning_rate': 1.9966123714295957e-05, 'epoch': 0.040855821954628536}
+ Step: 384 {'loss': 0.4817, 'grad_norm': 0.140832781791687, 'learning_rate': 1.9965677973694586e-05, 'epoch': 0.0413933985592947}
+ Step: 389 {'loss': 0.301, 'grad_norm': 0.10496355593204498, 'learning_rate': 1.9965232233093215e-05, 'epoch': 0.04193097516396086}
+ Step: 394 {'loss': 0.5082, 'grad_norm': 0.29614022374153137, 'learning_rate': 1.9964786492491848e-05, 'epoch': 0.04246855176862703}
+ Step: 399 {'loss': 0.4294, 'grad_norm': 0.5071314573287964, 'learning_rate': 1.996434075189048e-05, 'epoch': 0.04300612837329319}
+ Step: 404 {'loss': 0.4803, 'grad_norm': 0.2569158375263214, 'learning_rate': 1.996389501128911e-05, 'epoch': 0.04354370497795936}
+ Step: 409 {'loss': 0.5818, 'grad_norm': 0.17603649199008942, 'learning_rate': 1.9963449270687742e-05, 'epoch': 0.044081281582625524}
+ Step: 414 {'loss': 0.4718, 'grad_norm': 0.22577953338623047, 'learning_rate': 1.9963003530086375e-05, 'epoch': 0.044618858187291686}
+ Step: 419 {'loss': 0.4718, 'grad_norm': 0.16803650557994843, 'learning_rate': 1.9962557789485004e-05, 'epoch': 0.045156434791957856}
+ Step: 424 {'loss': 0.3851, 'grad_norm': 0.2118372768163681, 'learning_rate': 1.9962112048883633e-05, 'epoch': 0.04569401139662402}
+ Interrupted by user
+ Step: 425 {'train_runtime': 6098.8217, 'train_samples_per_second': 73.198, 'train_steps_per_second': 36.601, 'train_loss': 0.4727456678107311, 'epoch': 0.04580152671755725}
+ 01:11:05-200655 INFO LoRA training run is completed and saved.
+ 01:11:05-307213 INFO Training complete, saving
+ 01:11:05-399211 INFO Training interrupted.
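The log splits into two segments: steps 4-204 at a learning rate near 4e-5, then steps 209 onward near 2e-5 with the epoch counter jumping from ~0.011 to ~0.0226, which looks like a resumed run with a halved learning rate until the manual interrupt around step 425. Each line is a step index followed by a Python dict literal, so a few lines of stdlib code can replay the metrics; a sketch, assuming traininglog.txt is in the working directory:

# Sketch: parse traininglog.txt lines of the form
#   Step: <n> {'loss': ..., 'grad_norm': ..., 'learning_rate': ..., 'epoch': ...}
import ast

def parse_log(path="traininglog.txt"):
    records = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line.startswith("Step:"):
                continue  # skips "Interrupted by user" and the INFO lines
            step_str, dict_str = line[len("Step:"):].strip().split(" ", 1)
            records.append((int(step_str), ast.literal_eval(dict_str)))
    return records

for step, metrics in parse_log():
    if "loss" in metrics:  # the step 425 entry is a runtime summary, not a loss
        print(step, metrics["loss"], metrics["learning_rate"])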