ghidav committed
Commit · d8cfc60
Parent(s): d1b49ae

Add SAEs
Files changed:
- L0/.DS_Store +0 -0
- L0/1B/cfg.json +1 -0
- L0/1B/sae_weights.safetensors +3 -0
- L0/1B/sparsity.safetensors +3 -0
- L1/.DS_Store +0 -0
- L1/cfg.json +1 -0
- L1/sae_weights.safetensors +3 -0
- L1/sparsity.safetensors +3 -0
- L10/.DS_Store +0 -0
- L10/cfg.json +1 -0
- L10/sae_weights.safetensors +3 -0
- L10/sparsity.safetensors +3 -0
- L11/cfg.json +1 -0
- L11/sae_weights.safetensors +3 -0
- L11/sparsity.safetensors +3 -0
- L2/.DS_Store +0 -0
- L2/cfg.json +1 -0
- L2/sae_weights.safetensors +3 -0
- L2/sparsity.safetensors +3 -0
- L3/cfg.json +1 -0
- L4/.DS_Store +0 -0
- L4/cfg.json +1 -0
- L4/sae_weights.safetensors +3 -0
- L4/sparsity.safetensors +3 -0
- L5/cfg.json +1 -0
- L5/sae_weights.safetensors +3 -0
- L5/sparsity.safetensors +3 -0
- L6/.DS_Store +0 -0
- L6/cfg.json +1 -0
- L6/sae_weights.safetensors +3 -0
- L6/sparsity.safetensors +3 -0
- L7/cfg.json +1 -0
- L7/sae_weights.safetensors +3 -0
- L7/sparsity.safetensors +3 -0
- L8/.DS_Store +0 -0
- L8/cfg.json +1 -0
- L8/sae_weights.safetensors +3 -0
- L8/sparsity.safetensors +3 -0
- L9/cfg.json +1 -0
- L9/sae_weights.safetensors +3 -0
- L9/sparsity.safetensors +3 -0
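This commit adds one SAELens-format checkpoint per residual-stream layer of pythia-160m-deduped. Each Ln/ directory holds a cfg.json (training configuration), sae_weights.safetensors (SAE parameters), and sparsity.safetensors (per-feature firing statistics); note from the list above that the layer-0 checkpoint sits one level deeper, under L0/1B/, and that L3/ gains only its cfg.json in this commit. A minimal loading sketch, assuming SAELens 3.x; the repo id below is a placeholder, since the actual repository name is not visible in this view:

```python
# Sketch: fetch the repo and load one per-layer SAE with SAELens (>= 3.x).
# "your-org/sae-transfer-learning" is a placeholder guessed from the
# wandb_project field in cfg.json; substitute the real repo id.
from huggingface_hub import snapshot_download
from sae_lens import SAE

local_dir = snapshot_download("your-org/sae-transfer-learning")

# Layer-5 SAE, trained at blocks.5.hook_resid_post. (Layer 0 lives under
# L0/1B/ in this commit, not directly under L0/.)
sae = SAE.load_from_pretrained(f"{local_dir}/L5", device="cpu")
print(sae.cfg.hook_name, sae.cfg.d_in, sae.cfg.d_sae)
# blocks.5.hook_resid_post 768 6144
```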
L0/.DS_Store ADDED
Binary file (8.2 kB)
L0/1B/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.0.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 0, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L0_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/0gs7wqs5", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
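The config describes a JumpReLU SAE on blocks.0.hook_resid_post of pythia-160m-deduped: d_in 768 expanded by a factor of 8 to d_sae 6144, trained on 1B tokens of NeelNanda/pile-small-tokenized-2b at 4096 tokens per step, i.e. roughly 244k optimizer steps; l1_warm_up_steps (12207) and lr_decay_steps (48828) are 5% and 20% of that step budget. A quick consistency check over the committed file:

```python
import json

with open("L0/1B/cfg.json") as f:
    cfg = json.load(f)

# Dictionary width is the input width scaled by the expansion factor.
assert cfg["d_sae"] == cfg["d_in"] * cfg["expansion_factor"]      # 768 * 8 == 6144

# Step budget implied by the token budget and batch size.
steps = cfg["training_tokens"] // cfg["train_batch_size_tokens"]  # 244_140
assert cfg["l1_warm_up_steps"] == steps // 20                     # 5%  -> 12_207
assert cfg["lr_decay_steps"] == steps // 5                        # 20% -> 48_828
```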
L0/1B/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6acb7f40a9c8dd71edd3742ee221103ede3afb872a06b51ecd4560c1171e1502
size 37801344
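The .safetensors entries in this diff are Git LFS pointers (version, oid, size) rather than the tensors themselves; the ~38 MB payload is fetched by hash at checkout. The stated size is consistent with the config under the assumption of the usual JumpReLU tensor set: W_enc (768x6144), W_dec (6144x768), b_enc and threshold (6144 each), and b_dec (768) come to 9,450,240 float32 values, i.e. 37,800,960 bytes, leaving 384 bytes for the safetensors header. A sketch for checking a downloaded file against its pointer:

```python
import hashlib

def lfs_oid(path: str, chunk_size: int = 1 << 20) -> str:
    """SHA-256 of the file contents, as recorded in the LFS pointer's oid."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

# Expected oid taken from the L0/1B/sae_weights.safetensors pointer above.
expected = "6acb7f40a9c8dd71edd3742ee221103ede3afb872a06b51ecd4560c1171e1502"
assert lfs_oid("L0/1B/sae_weights.safetensors") == expected
```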
L0/1B/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e93cee41bb74dca9fd92ed2659b273a82d61bad27333b9929b8d99493e4b6d10
size 24656
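sparsity.safetensors holds one statistic per feature: 6,144 float32 values are 24,576 bytes, which plus an 80-byte safetensors header matches the stated 24,656. A reading sketch, assuming the SAELens convention of a single tensor named "sparsity" containing log10 feature firing rates:

```python
from safetensors.torch import load_file

# Assumed layout: one tensor named "sparsity" of shape (d_sae,) holding
# log10 firing rates, following SAELens checkpoint conventions.
log_sparsity = load_file("L0/1B/sparsity.safetensors")["sparsity"]
print(log_sparsity.shape)  # torch.Size([6144])

# cfg.json sets dead_feature_threshold = 1e-06, i.e. a log10 rate below -6.
dead = int((log_sparsity < -6).sum())
print(f"{dead} features fire less often than about once per million tokens")
```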
L1/.DS_Store ADDED
Binary file (8.2 kB)
L1/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.1.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 1, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L1_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/eezx69qa", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L1/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:113867ac12ca66845b57f406a33b7647b4243dcab468b4abad198843f76224c6
size 37801344
L1/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:174b789045f37a36a15dc10cfc160380a4e7422be85e7b46ddea8099fad4b1fc
size 24656
L10/.DS_Store ADDED
Binary file (8.2 kB)
L10/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.10.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 10, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L10_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/ty4tmelj", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L10/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ca3c7f84228d86a3243162744b2c7d3fbeec208eb0c6fbdba80157c94d8d522d
size 37801344
L10/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6f819e1d5dab7f01e71151ae043b0bba84d1466cf8ca7246f613236a45e6ed3c
size 24656
L11/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.11.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 11, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": false, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L11_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/wo5jtn8p", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L11/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9a1b57a0c720fde27f3e9b67c9de5e42ad6c3d5a6fabec347bff5edbad3989bf
size 37801344
L11/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:988c05816550e4f536eaa583a039d0d480ac1b007bb55779e77aff08b5cf247c
size 24656
L2/.DS_Store ADDED
Binary file (8.2 kB)
L2/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.2.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 2, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L2_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/74k12zkd", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L2/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a1a1f78127fd5ea8761c635166d599b726332bff51e9aa4059e1eb4d34710906
size 37801344
L2/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:32b964e6227a44ebab212053051675cd85103082466548731032c33ab91cc482
size 24656
L3/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.3.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 3, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": false, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L3_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/tpr9k67q", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L4/.DS_Store ADDED
Binary file (8.2 kB)
L4/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.4.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 4, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L4_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/9hlnqk5i", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L4/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:80747516ff91d84590f9f34d7bc8de6f3dd314fca1b763499ac74db90ad76045
size 37801344
L4/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1d62798cba6069bc0942c8cfeae560083463b3c9bf8bd675b8a32ee56ee2dc01
size 24656
L5/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.5.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 5, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L5_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/99xxpw0a", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L5/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6974c7bd8502c276ad9b3363f9b4f3d2d9368a50878a9e77853562f0b620becb
size 37801344
L5/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a9e1bf4d35ccb0f088f422226609090061dc73c3488c1ab6a2a5c9c9f17139c1
size 24656
L6/.DS_Store ADDED
Binary file (8.2 kB)
L6/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.6.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 6, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L6_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/4bppuj63", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L6/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a793ded70ec91c54449df4a1519fcc0f641372f6dbd94faeb077759ed2dc0a6
size 37801344
L6/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c2b8476f4468db77d4c02adc6fe2a90dca09d07630ecc9e9df78d3794fe59b7f
size 24656
L7/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.7.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 7, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L7_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/u96c04u1", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L7/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba80145dd8ef8e6ceece31bbe6ede4942e000fc303b57c29ad075425cccdd39a
size 37801344
L7/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b11c16beca5275de29d30242969be458eb60e55f8cc205ceaf244206579cdcf8
size 24656
L8/.DS_Store ADDED
Binary file (8.2 kB)
L8/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.8.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 8, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L8_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/bet5fk1u", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L8/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d961b088585640ff8e344424dd5d60b8790336c4b575347b8c5e7d3b662937aa
size 37801344
L8/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:51cbe8a4dea526ae66d3c6fe5d3b155a998981a6b964230b6b83be327b0e157b
size 24656
L9/cfg.json ADDED
@@ -0,0 +1 @@
{"model_name": "pythia-160m-deduped", "model_class_name": "HookedTransformer", "hook_name": "blocks.9.hook_resid_post", "hook_eval": "NOT_IN_USE", "hook_layer": 9, "hook_head_index": null, "dataset_path": "NeelNanda/pile-small-tokenized-2b", "dataset_trust_remote_code": true, "streaming": true, "is_dataset_tokenized": true, "context_size": 1024, "use_cached_activations": false, "cached_activations_path": null, "architecture": "jumprelu", "d_in": 768, "d_sae": 6144, "b_dec_init_method": "zeros", "expansion_factor": 8, "activation_fn": "relu", "activation_fn_kwargs": {}, "normalize_sae_decoder": true, "noise_scale": 0.0, "from_pretrained_path": null, "apply_b_dec_to_input": false, "decoder_orthogonal_init": false, "decoder_heuristic_init": false, "init_encoder_as_decoder_transpose": false, "n_batches_in_buffer": 128, "training_tokens": 1000000000, "finetuning_tokens": 0, "store_batch_size_prompts": 8, "train_batch_size_tokens": 4096, "normalize_activations": "none", "device": "cuda", "act_store_device": "cuda", "seed": 42, "dtype": "float32", "prepend_bos": false, "autocast": false, "autocast_lm": false, "compile_llm": false, "llm_compilation_mode": null, "compile_sae": false, "sae_compilation_mode": null, "adam_beta1": 0, "adam_beta2": 0.999, "mse_loss_normalization": null, "l1_coefficient": 1.0, "lp_norm": 1, "scale_sparsity_penalty_by_decoder_norm": false, "l1_warm_up_steps": 12207, "lr": 3e-05, "lr_scheduler_name": "constant", "lr_warm_up_steps": 0, "lr_end": 3e-06, "lr_decay_steps": 48828, "n_restart_cycles": 1, "finetuning_method": null, "use_ghost_grads": false, "feature_sampling_window": 2000, "dead_feature_window": 1000, "dead_feature_threshold": 1e-06, "n_eval_batches": 10, "eval_batch_size_prompts": null, "log_to_wandb": true, "log_activations_store_to_wandb": false, "log_optimizer_state_to_wandb": false, "wandb_project": "sae-transfer-learning", "wandb_id": null, "run_name": "L9_hook_resid_post_L1_1_0", "wandb_entity": null, "wandb_log_frequency": 30, "eval_every_n_wandb_logs": 100, "resume": false, "n_checkpoints": 10, "checkpoint_path": "checkpoints/7p058t1g", "verbose": true, "model_kwargs": {}, "model_from_pretrained_kwargs": {}, "sae_lens_version": "3.13.1", "sae_lens_training_version": "3.13.1", "tokens_per_buffer": 536870912}
L9/sae_weights.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d0b835f51e77fb5983e7682227e0f6e186f2bf078f2e1ac7d7c22f551231cb01
size 37801344
L9/sparsity.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b107e64c49f78b9eff6f371be4579effe7b6c9fd56cf33eadff9c96fd6ecc98
size 24656