nsthorat-lilac committed on
Commit
e5fe28b
·
1 Parent(s): fe2e46f

Upload docker_start.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. docker_start.py +116 -0
docker_start.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Startup work before running the web server."""
2
+
3
+ import os
4
+ import shutil
5
+ from typing import TypedDict
6
+
7
+ import yaml
8
+ from huggingface_hub import scan_cache_dir, snapshot_download
9
+
10
+ from lilac.concepts.db_concept import CONCEPTS_DIR, DiskConceptDB, get_concept_output_dir
11
+ from lilac.env import env, get_project_dir
12
+ from lilac.project import PROJECT_CONFIG_FILENAME
13
+ from lilac.utils import get_datasets_dir, get_lilac_cache_dir, log
14
+
15
+
16
def delete_old_files() -> None:
    """Delete every cached HuggingFace revision except the newest one of each repo.

    Scans the local HuggingFace cache with `scan_cache_dir`, keeps the revision with the greatest
    `last_modified` for each cached repo, and deletes all the others. The cleanup is best-effort:
    if the cache cannot be scanned, nothing is deleted and the function returns.
    """
    try:
        scan = scan_cache_dir()
    except Exception as e:
        # Usually this means the cache was not found. `Exception` (rather than `BaseException`)
        # lets KeyboardInterrupt and SystemExit propagate instead of being silently swallowed.
        log(f'Skipping cache cleanup, could not scan the HuggingFace cache: {e}')
        return

    # Select revisions to delete: everything but the most recently modified one per repo.
    to_delete: list[str] = []
    for repo in scan.repos:
        # `default=None` guards against a cached repo without any revision, where a bare `max()`
        # would raise ValueError. In that case the generator below yields nothing.
        latest_revision = max(repo.revisions, key=lambda x: x.last_modified, default=None)
        to_delete.extend(
            revision.commit_hash for revision in repo.revisions if revision != latest_revision)

    if not to_delete:
        # Nothing is stale, so there is no deletion strategy to build or execute.
        return

    strategy = scan.delete_revisions(*to_delete)

    # Delete them.
    log(f'Will delete {len(to_delete)} old revisions and save {strategy.expected_freed_size_str}')
    strategy.execute()
36
+
37
+
38
class HfSpaceConfig(TypedDict):
    """Typed view of the HuggingFace Space config stored in the README.md front matter.

    Only the keys declared here are modeled. The full list of keys is documented at:
    https://huggingface.co/docs/hub/spaces-config-reference
    """
    # The `title` entry of the Space config.
    title: str
    # The `datasets` entry of the Space config: a list of dataset repo ids.
    datasets: list[str]
46
+
47
+
48
def _extract_front_matter(readme: str) -> str:
    """Return the YAML text enclosed by the leading '---' fences of a README."""
    lines = readme.strip().splitlines()
    if not lines or lines[0].strip() != '---':
        # No opening fence: fall back to treating the whole file as YAML, minus stray dashes.
        return readme.strip().strip('-')
    body = lines[1:]
    for index, line in enumerate(body):
        if line.strip() == '---':
            # Closing fence: everything after it is markdown, not config.
            return '\n'.join(body[:index])
    # Opening fence without a closing one: the rest of the file is the config.
    return '\n'.join(body)


def main() -> None:
    """Download dataset files from the HF space that was uploaded before building the image.

    Does nothing unless the `SPACE_ID` environment variable is set. Otherwise it:
      1. Deletes old revisions from the HuggingFace cache.
      2. Reads the space config from the front matter of `README.md` in the working directory.
      3. Downloads every dataset listed in the config into the project's datasets directory.
      4. Downloads the space snapshot and copies its project config, cache files and non-lilac
         concepts into the project directory.
    """
    # SPACE_ID is the HuggingFace Space ID environment variable that is automatically set by HF.
    repo_id = env('SPACE_ID', None)
    if not repo_id:
        return

    delete_old_files()

    # Read as UTF-8 explicitly so the result does not depend on the container's locale.
    with open(os.path.abspath('README.md'), encoding='utf-8') as f:
        # Only the fenced front matter is YAML. Passing a trailing '---' or the markdown body to
        # the parser would make `yaml.safe_load` fail or return the wrong document.
        front_matter = _extract_front_matter(f.read())
    # `safe_load` returns None for an empty document; treat that as an empty config.
    hf_config: HfSpaceConfig = yaml.safe_load(front_matter) or {}

    project_dir = get_project_dir()
    hf_token = env('HF_ACCESS_TOKEN')

    # Download the datasets listed in the space config into the project's datasets directory.
    datasets_dir = get_datasets_dir(project_dir)
    os.makedirs(datasets_dir, exist_ok=True)
    # A config without a `datasets` key (or with an empty one) simply has nothing to download.
    for lilac_hf_dataset in hf_config.get('datasets') or []:
        print('Downloading dataset from HuggingFace: ', lilac_hf_dataset)
        snapshot_download(
            repo_id=lilac_hf_dataset,
            repo_type='dataset',
            token=hf_token,
            local_dir=datasets_dir,
            ignore_patterns=['.gitattributes', 'README.md'])

    # Download the huggingface space data. This includes code and datasets; only the pieces
    # under its `data` directory are copied into the project directory below.
    snapshot_dir = snapshot_download(repo_id=repo_id, repo_type='space', token=hf_token)

    spaces_data_dir = os.path.join(snapshot_dir, 'data')
    # Copy the config file.
    project_config_file = os.path.join(spaces_data_dir, PROJECT_CONFIG_FILENAME)
    if os.path.exists(project_config_file):
        shutil.copy(project_config_file, os.path.join(project_dir, PROJECT_CONFIG_FILENAME))

    # Delete cache files from persistent storage.
    cache_dir = get_lilac_cache_dir(project_dir)
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)

    # NOTE: This is temporary during the move of concepts into the pip package. Once all the demos
    # have been updated, this block can be deleted.
    old_lilac_concepts_data_dir = os.path.join(project_dir, CONCEPTS_DIR, 'lilac')
    if os.path.exists(old_lilac_concepts_data_dir):
        shutil.rmtree(old_lilac_concepts_data_dir)

    # Copy cache files from the space if they exist. `cache_dir` was removed above, so
    # `copytree` can create it afresh.
    spaces_cache_dir = get_lilac_cache_dir(spaces_data_dir)
    if os.path.exists(spaces_cache_dir):
        shutil.copytree(spaces_cache_dir, cache_dir)

    # Copy concepts.
    concepts = DiskConceptDB(spaces_data_dir).list()
    for concept in concepts:
        # Ignore lilac concepts, they're already part of the source code.
        if concept.namespace == 'lilac':
            continue
        spaces_concept_output_dir = get_concept_output_dir(spaces_data_dir, concept.namespace,
                                                           concept.name)
        persistent_output_dir = get_concept_output_dir(project_dir, concept.namespace,
                                                       concept.name)
        # Replace the persistent copy with the one from the space, then drop the space's copy.
        shutil.rmtree(persistent_output_dir, ignore_errors=True)
        shutil.copytree(spaces_concept_output_dir, persistent_output_dir, dirs_exist_ok=True)
        shutil.rmtree(spaces_concept_output_dir, ignore_errors=True)
113
+
114
+
115
# Run the startup work only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()