fix space
parent 3de8f3647b
commit 40b342a12e
@@ -17,6 +17,7 @@ jobs:
    outputs:
      docker_image: ${{ steps.final.outputs.docker_image }}
      docker_devices: ${{ steps.final.outputs.docker_devices }}
      docker_volume: ${{ steps.final.outputs.docker_volume }}
      runs_on: ${{ steps.final.outputs.runs_on }}
      label: ${{ steps.final.outputs.label }}
    concurrency:
@@ -157,6 +158,33 @@ jobs:
echo "runs_on=${{ env.RUNS_ON }}" >> "$GITHUB_OUTPUT"
|
||||
echo "label=${{ env.LABEL }}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
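          # The Hugging Face hub cache lives at a different path on the ROCm
          # machines, so pick the volume to mount per hardware target.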
          if [[ ${{ inputs.hardware }} == "rocm" ]]
          then
            echo "docker_volume=/data/cache/.cache/huggingface/hub" >> "$GITHUB_OUTPUT"
          else
            echo "docker_volume=/mnt/cache" >> "$GITHUB_OUTPUT"
          fi

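  # Frees disk space on the runner before the integration tests download their models.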
  prepare_integration_tests:
    runs-on: ["self-hosted", "${{ needs.build-and-push.outputs.runs_on }}", "multi-gpu"]
    needs: build-and-push
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
      cancel-in-progress: true
    if: needs.build-and-push.outputs.runs_on != 'ubuntu-latest'
    container:
      image: ${{ needs.build-and-push.outputs.docker_image }}
      options: --shm-size "16gb" --ipc host -v ${{ needs.build-and-push.outputs.docker_volume }}:/data
    steps:
      - name: Clean Hugging Face cache
        run: |
          if [[ ${{ inputs.hardware }} == "rocm" ]]
          then
            python clean_cache.py
          fi

  integration_tests:
    concurrency:
      group: ${{ github.workflow }}-${{ github.job }}-${{ needs.build-and-push.outputs.label }}-${{ github.head_ref || github.run_id }}
@@ -188,7 +216,6 @@ jobs:
      - name: Run tests
        run: |
          export DOCKER_VOLUME=/mnt/cache
          export DOCKER_DEVICES=${{ needs.build-and-push.outputs.docker_devices }}
          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}

clean_cache.py
@@ -0,0 +1,115 @@
import os
import shutil

import huggingface_hub

# The workflow exports HUGGING_FACE_HUB_TOKEN for the test run (see above);
# fall back to an anonymous session when it is unset.
token = os.environ.get("HUGGING_FACE_HUB_TOKEN")

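# Models (and revisions) the CI test suite needs; the cleanup below never
# evicts these from the cache.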
REQUIRED_MODELS = {
    "bigscience/bloom-560m": "main",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0": "main",
    "abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq": "main",
    "tiiuae/falcon-7b": "main",
    "TechxGenus/gemma-2b-GPTQ": "main",
    "google/gemma-2b": "main",
    "openai-community/gpt2": "main",
    "turboderp/Llama-3-8B-Instruct-exl2": "2.5bpw",
    "huggingface/llama-7b-gptq": "main",
    "neuralmagic/llama-2-7b-chat-marlin": "main",
    "huggingface/llama-7b": "main",
    "FasterDecoding/medusa-vicuna-7b-v1.3": "refs/pr/1",
    "mistralai/Mistral-7B-Instruct-v0.1": "main",
    "OpenAssistant/oasst-sft-1-pythia-12b": "main",
    "stabilityai/stablelm-tuned-alpha-3b": "main",
    "google/paligemma-3b-pt-224": "main",
    "microsoft/phi-2": "main",
    "Qwen/Qwen1.5-0.5B": "main",
    "bigcode/starcoder": "main",
    "Narsil/starcoder-gptq": "main",
    "bigcode/starcoder2-3b": "main",
    "HuggingFaceM4/idefics-9b-instruct": "main",
    "HuggingFaceM4/idefics2-8b": "main",
    "llava-hf/llava-v1.6-mistral-7b-hf": "main",
    "state-spaces/mamba-130m": "main",
    "mosaicml/mpt-7b": "main",
    "bigscience/mt0-base": "main",
    "google/flan-t5-xxl": "main",
}

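# Estimate how much disk the required models need, then delete the largest
# unused cached repos until the pending downloads fit on /data.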
def cleanup_cache():
    # Retrieve the size per model for all models used in the CI.
    size_per_model = {}
    for model_id, revision in REQUIRED_MODELS.items():
        model_size = 0
        all_files = huggingface_hub.list_repo_files(
            model_id,
            repo_type="model",
            revision=revision,
            token=token,
        )

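        # Only count the weight format that would actually be downloaded,
        # preferring safetensors over torch checkpoints.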
        extension = None
        if any(".safetensors" in filename for filename in all_files):
            extension = ".safetensors"
        elif any(".pt" in filename for filename in all_files):
            extension = ".pt"
        elif any(".bin" in filename for filename in all_files):
            extension = ".bin"

        for filename in all_files:
            # Guard against repos that ship none of the known weight formats.
            if extension is not None and filename.endswith(extension):
                file_url = huggingface_hub.hf_hub_url(
                    model_id, filename, revision=revision
                )
                file_metadata = huggingface_hub.get_hf_file_metadata(
                    file_url, token=token
                )
                model_size += file_metadata.size * 1e-9  # in GB

        size_per_model[model_id] = model_size

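    # Compare what is already on disk against what still has to be downloaded.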
    cached_dir = huggingface_hub.scan_cache_dir()

    cache_size_per_model = {}
    cached_required_size_per_model = {}
    cached_shas_per_model = {}

    # Retrieve the SHAs and sizes of the non-required models in the cache.
    for repo in cached_dir.repos:
        if repo.repo_id in REQUIRED_MODELS:
            cached_required_size_per_model[repo.repo_id] = (
                repo.size_on_disk * 1e-9
            )  # in GB
        elif repo.repo_type == "model":
            cache_size_per_model[repo.repo_id] = repo.size_on_disk * 1e-9  # in GB

            shas = []
            for _, ref in repo.refs.items():
                shas.append(ref.commit_hash)
            cached_shas_per_model[repo.repo_id] = shas

    total_required_cached_size = sum(cached_required_size_per_model.values())
    total_other_cached_size = sum(cache_size_per_model.values())
    total_required_size = sum(size_per_model.values())

    total_non_cached_required_size = total_required_size - total_required_cached_size

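    # Fail fast if even evicting every non-required model cannot make room,
    # keeping a 5% safety margin on the remaining downloads.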
    free_memory = shutil.disk_usage("/data").free * 1e-9
    if free_memory + total_other_cached_size < total_non_cached_required_size * 1.05:
        raise ValueError(
            "Not enough space on the device to run the complete CI; please clean up the CI machine."
        )

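    # Evict the largest non-required cached repos one by one, re-checking the
    # free space after each deletion.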
    while free_memory < total_non_cached_required_size * 1.05:
        if len(cache_size_per_model) == 0:
            raise ValueError(
                "This should not happen: the space check above guarantees "
                "enough reclaimable cache."
            )

        largest_model_id = max(cache_size_per_model, key=cache_size_per_model.get)

        print("Removing", largest_model_id)
        for sha in cached_shas_per_model[largest_model_id]:
            huggingface_hub.scan_cache_dir().delete_revisions(sha).execute()

        del cache_size_per_model[largest_model_id]

        free_memory = shutil.disk_usage("/data").free * 1e-9

if __name__ == "__main__":
    cleanup_cache()