Forked from zhusg / transformers-new
9351 commits behind, 11 commits ahead of the upstream repository.
config.yml 42.00 KiB
version: 2.1
orbs:
    gcp-gke: circleci/gcp-gke@1.0.4
    go: circleci/go@1.3.0
# TPU REFERENCES
references:
    checkout_ml_testing: &checkout_ml_testing
        run:
            name: Checkout ml-testing-accelerators
            command: |
                git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
                cd ml-testing-accelerators
                git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
                git checkout stable
    build_push_docker: &build_push_docker
        run:
            name: Configure Docker
            command: |
                gcloud --quiet auth configure-docker
                cd docker/transformers-pytorch-tpu
                if [ -z "$CIRCLE_PR_NUMBER" ]; then docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" . ; else docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=pull/$CIRCLE_PR_NUMBER/head" . ; fi
                docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
    deploy_cluster: &deploy_cluster
        run:
            name: Deploy the job on the kubernetes cluster
            command: |
                go get github.com/google/go-jsonnet/cmd/jsonnet && \
                export PATH=$PATH:$HOME/go/bin && \
                kubectl create -f docker/transformers-pytorch-tpu/dataset.yaml || true && \
                job_name=$(jsonnet -J ml-testing-accelerators/ docker/transformers-pytorch-tpu/bert-base-cased.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -) && \
                job_name=${job_name#job.batch/} && \
                job_name=${job_name% created} && \
                echo "Waiting on kubernetes job: $job_name" && \
                i=0 && \
                # 30 checks spaced 30s apart = 900s total.
                max_checks=30 && \
                status_code=2 && \
                # Check on the job periodically. Set the status code depending on what
                # happened to the job in Kubernetes. If we try max_checks times and
                # still the job hasn't finished, give up and return the starting
                # non-zero status code.
                while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
                echo "Done waiting. Job status code: $status_code" && \
                pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
                echo "GKE pod name: $pod_name" && \
                kubectl logs -f $pod_name --container=train
                echo "Done with log retrieval attempt." && \
                gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
                exit $status_code
    delete_gke_jobs: &delete_gke_jobs
        run:
            name: Delete GKE Jobs
            command: |
                # Match jobs whose age matches patterns like '1h' or '1d', i.e. any job
                # that has been around longer than 1hr. First print all columns for
                # matches, then execute the delete.
                kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}'
                kubectl delete job $(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}')
jobs:
    run_tests_torch_and_tf:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
7172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
RUN_PT_TF_CROSS_TESTS: yes TRANSFORMERS_IS_CI: yes resource_class: xlarge parallelism: 1 steps: - checkout - restore_cache: keys: - v0.4-torch_and_tf-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng - run: pip install --upgrade pip - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision] - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html - run: pip install tensorflow_probability - run: pip install https://github.com/kpu/kenlm/archive/master.zip - run: pip install git+https://github.com/huggingface/accelerate - save_cache: key: v0.4-{{ checksum "setup.py" }} paths: - '~/.cache/pip' - run: python utils/tests_fetcher.py | tee test_preparation.txt - store_artifacts: path: ~/transformers/test_preparation.txt - run: | if [ -f test_list.txt ]; then python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf $(cat test_list.txt) -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt fi - store_artifacts: path: ~/transformers/tests_output.txt - store_artifacts: path: ~/transformers/reports run_tests_torch_and_tf_all: working_directory: ~/transformers docker: - image: circleci/python:3.7 environment: OMP_NUM_THREADS: 1 RUN_PT_TF_CROSS_TESTS: yes TRANSFORMERS_IS_CI: yes resource_class: xlarge parallelism: 1 steps: - checkout - restore_cache: keys: - v0.4-torch_and_tf-{{ checksum "setup.py" }} - v0.4-{{ checksum "setup.py" }} - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng - run: pip install --upgrade pip - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision] - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.11.0+cpu.html - run: pip install tensorflow_probability - run: pip install https://github.com/kpu/kenlm/archive/master.zip - run: pip install git+https://github.com/huggingface/accelerate - save_cache: key: v0.4-{{ checksum "setup.py" }} paths: - '~/.cache/pip' - run: | python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf tests -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt - store_artifacts: path: ~/transformers/tests_output.txt - store_artifacts: path: ~/transformers/reports run_tests_torch_and_flax: working_directory: ~/transformers docker: