from airflow import DAG
from airflow.models import Param
from airflow.kubernetes.secret import Secret
from kubernetes.client import models as k8s
from smartflow.operators.pod import create_pod_task
from datetime import datetime
from textwrap import dedent

with DAG(
    dag_id="KIT_TRAIN_NATIVE_BAS",
    description="Kitware demo: fits a BAS fusion model on Drop4 data, makes predictions, and evaluates them.",
    params={
        "smart_version_tag": Param(default="main", type="string"),
    },
    catchup=False,
    schedule_interval=None,
    max_active_runs=1,
    default_view="grid",
    tags=["watch", "training"],
    start_date=datetime(2022, 3, 1),
) as dag:
    DVC_DATA_DPATH = "/efs/work/greenwell/data/Aligned-Drop4-2022-08-08-TA1-S2-L8-ACC"
    DVC_EXPT_DPATH = "/efs/work/greenwell/data/smart_expt_dvc"
    WORKDIR = f"{DVC_EXPT_DPATH}/training/smartflow/airflow_root"

    # Paths to the train / validation / test kwcoco files.
    # NOTE: the validation split doubles as the test split here.
    TRAIN_FPATH = f"{DVC_DATA_DPATH}/data_train.kwcoco.json"
    VALI_FPATH = f"{DVC_DATA_DPATH}/data_vali.kwcoco.json"
    TEST_FPATH = f"{DVC_DATA_DPATH}/data_vali.kwcoco.json"

    EXPERIMENT_NAME = "Drop4-BAS_Heterogeneous"
    DATASET_CODE = "Drop4-BAS"
    DEFAULT_ROOT_DIR = f"{WORKDIR}/{DATASET_CODE}/runs/{EXPERIMENT_NAME}"

    """
    Training, Prediction, and Evaluation
    ------------------------------------

    Now that we are more comfortable with kwcoco files, let's get into the
    simplest and most direct way of training a fusion model. This is done by
    calling 'geowatch.tasks.fusion fit' as the main module.

    We will specify:

    * paths to the training and validation kwcoco files

    * which channels we want to early / late fuse (given by a kwcoco
      sensorchan spec)

    * information about the input chip size and temporal window

    * the underlying architecture

    * other deep learning hyperparameters

    In this tutorial we use 'gpu' as our lightning accelerator and pass
    '--trainer.devices="0,"'. If you only have a CPU available, change the
    accelerator to 'cpu' and drop the devices argument.

    We also specify a work directory similar to the directories used when real
    watch models are trained.
    """
    train_model = create_pod_task(
        task_id="train_model",
        image="registry.smartgitlab.com/kitware/watch/ta2:Oct31-debug11",
        secrets=[
            Secret('env', 'WATCH_GITLAB_USERNAME', 'watch-gitlab-repo', 'username'),
            Secret('env', 'WATCH_GITLAB_PASSWORD', 'watch-gitlab-repo', 'password'),
        ],
        env_vars=[
            k8s.V1EnvVar(name="EXPERIMENT_NAME", value=EXPERIMENT_NAME),
            k8s.V1EnvVar(name="DEFAULT_ROOT_DIR", value=DEFAULT_ROOT_DIR),
            k8s.V1EnvVar(name="TRAIN_FPATH", value=TRAIN_FPATH),
            k8s.V1EnvVar(name="VALI_FPATH", value=VALI_FPATH),
        ],
        cmds=["bash", "-exc"],
        arguments=[
            dedent(
                """
                ls /efs/work/greenwell
                echo "======================="
                ls /efs/work/greenwell/data
                echo "======================="
                ls /efs/work/greenwell/data/Aligned-Drop4-2022-08-08-TA1-S2-L8-ACC

                ##############################################
                # Setup environment and codebase
                ##############################################
                mkdir -p /root/code
                git clone https://$WATCH_GITLAB_USERNAME:$WATCH_GITLAB_PASSWORD@gitlab.kitware.com/smart/watch.git /root/code/watch
                cd /root/code/watch
                git remote update; git checkout {{ params.smart_version_tag }}

                # source run_developer_setup.sh
                pip install -r requirements.txt -v
                pip install -r requirements/gdal.txt
                # pip install -r requirements/headless.txt
                pip install -e .
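
                # Optional sanity checks before the long training step (both
                # are non-fatal under "bash -exc"): confirm the editable
                # install is importable and report whether a GPU is visible.
                python -c "import geowatch; print('geowatch at', geowatch.__file__)" || true
                nvidia-smi || echo "WARNING: no GPU visible to this pod"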
                pip install -U delayed-image
                python -c "import delayed_image; print('delayed_image.version = ', delayed_image.__version__)"

                ##############################################
                # Train the model
                ##############################################
                python -m geowatch.tasks.fusion fit \
                    --trainer.default_root_dir="$DEFAULT_ROOT_DIR" \
                    --data.train_dataset="$TRAIN_FPATH" \
                    --data.vali_dataset="$VALI_FPATH" \
                    --data.channels="red|green|blue|nir" \
                    --data.window_space_scale="10GSD" \
                    --data.input_space_scale="native" \
                    --data.output_space_scale="30GSD" \
                    --data.time_steps=3 \
                    --data.chip_size=128 \
                    --data.batch_size=64 \
                    --data.num_workers=30 \
                    --model=watch.tasks.fusion.methods.HeterogeneousModel \
                    --model.name="$EXPERIMENT_NAME" \
                    --model.token_width=16 \
                    --model.token_dim=32 \
                    --model.position_encoder=watch.tasks.fusion.methods.heterogeneous.MipNerfPositionalEncoder \
                    --model.position_encoder.in_dims=3 \
                    --model.position_encoder.max_freq=3 \
                    --model.position_encoder.num_freqs=16 \
                    --model.backbone=watch.tasks.fusion.architectures.transformer.TransformerEncoderDecoder \
                    --model.backbone.encoder_depth=6 \
                    --model.backbone.decoder_depth=1 \
                    --model.backbone.dim=128 \
                    --model.backbone.queries_dim=96 \
                    --model.backbone.logits_dim=32 \
                    --optimizer=torch.optim.AdamW \
                    --optimizer.lr=1e-3 \
                    --optimizer.weight_decay=1e-3 \
                    --trainer.max_steps=1000000 \
                    --trainer.accelerator="gpu" \
                    --trainer.precision=16 \
                    --trainer.devices="0,"

                geowatch torch_model_stats "$DEFAULT_ROOT_DIR"/final_package.pt --stem_stats=True
                """
            )
        ],
        purpose="gpu-nvidia-t4-c32-m128-g1-od",  # TODO: choose multi-gpu node
        cpu_limit="31",
        memory_limit="120G",
        mount_dshm=True,
        mount_efs_work=True,
    )

    """
    Now that we have an understanding of what metadata the model contains, we
    can start to appreciate the dead simplicity of predicting with it.

    To use a model to predict on an unseen kwcoco dataset (in this case the
    validation split, which doubles as the test set) we simply call the
    "geowatch.tasks.fusion.predict" script and pass it:

    * the kwcoco file of the dataset to predict on

    * the path to the model we want to predict with

    * the name of the output kwcoco file that will contain the predictions

    All necessary metadata you would normally have to (redundantly) specify in
    other frameworks is inferred by programmatically reading the model. You
    also have the option to override prediction parameters. See --help for
    details, but for now let's just run with the defaults that match how the
    model was trained.

    Note that the test dataset contains groundtruth annotations. All
    annotations are stripped and ignored during prediction.

    The output of the predictions is just another kwcoco file, but it augments
    the input images with new channels corresponding to predicted heatmaps. We
    can use the "geowatch stats" command to inspect what these new channels
    are.
    """
    predict = create_pod_task(
        task_id="predict",
        image="registry.smartgitlab.com/kitware/watch/ta2:Oct31-debug11",
        cmds=["bash", "-exc"],
        env_vars=[
            k8s.V1EnvVar(name="DVC_EXPT_DPATH", value=DVC_EXPT_DPATH),
            k8s.V1EnvVar(name="DEFAULT_ROOT_DIR", value=DEFAULT_ROOT_DIR),
            k8s.V1EnvVar(name="TEST_FPATH", value=TEST_FPATH),
        ],
        arguments=[
            dedent(
                """
                # TODO: build docker image with these steps already done
                cd /watch
                pip install -r requirements/development.txt
                pip install -r requirements/runtime.txt
                pip install -r requirements/optional.txt
                pip install -r requirements/gdal.txt
                pip install -e .
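
                # Optional: summarize the trained package before predicting.
                # Assumes the upstream train_model task has already written
                # $DEFAULT_ROOT_DIR/final_package.pt; non-fatal if it is missing.
                geowatch torch_model_stats "$DEFAULT_ROOT_DIR"/final_package.pt || true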
                python -m geowatch.tasks.fusion.predict \
                    --channels="red|green|blue|nir" \
                    --test_dataset="$TEST_FPATH" \
                    --package_fpath="$DEFAULT_ROOT_DIR"/final_package.pt \
                    --pred_dataset="$DVC_EXPT_DPATH"/predictions/pred.kwcoco.json

                geowatch stats "$DVC_EXPT_DPATH"/predictions/pred.kwcoco.json
                """
            )
        ],
        purpose="gpu-nvidia-t4-c32-m128-g1-od",  # TODO: choose single-gpu node
        cpu_limit="15",
        memory_limit="28G",
        mount_dshm=True,
        mount_efs_work=True,
    )

    """
    The last step in this basic tutorial is to measure how good our model is.
    We can do this with pixelwise metrics.

    This is done by using "geowatch.tasks.fusion.evaluate" as the main module,
    and its arguments are:

    * The true kwcoco data with groundtruth annotations (i.e. the test dataset)

    * The pred kwcoco data that we predicted earlier

    * An output path for results
    """
    eval_model = create_pod_task(
        task_id="eval_model",
        image="registry.smartgitlab.com/kitware/watch/ta2:Oct31-debug11",
        cmds=["bash", "-exc"],
        env_vars=[
            k8s.V1EnvVar(name="DVC_EXPT_DPATH", value=DVC_EXPT_DPATH),
            k8s.V1EnvVar(name="TEST_FPATH", value=TEST_FPATH),
        ],
        arguments=[
            dedent(
                """
                # TODO: build docker image with these steps already done
                cd /watch
                pip install -r requirements/development.txt
                pip install -r requirements/runtime.txt
                pip install -r requirements/optional.txt
                pip install -r requirements/gdal.txt
                pip install -e .

                python -m geowatch.tasks.fusion.evaluate \
                    --true_dataset="$TEST_FPATH" \
                    --pred_dataset="$DVC_EXPT_DPATH"/predictions/pred.kwcoco.json \
                    --eval_dpath="$DVC_EXPT_DPATH"/predictions/eval
                """
            )
        ],
        purpose="gpu-nvidia-t4-c32-m128-g1-od",
        cpu_limit="15",
        memory_limit="28G",
        mount_dshm=True,
        mount_efs_work=True,
    )

    train_model >> predict >> eval_model
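
# Minimal, optional sketch for local debugging, using only the DAG object
# defined above: running this module directly prints each task and its
# downstream dependencies as a quick structural check of the pipeline.
if __name__ == "__main__":
    for task in dag.tasks:
        downstream = [t.task_id for t in task.downstream_list]
        print(f"{task.task_id} -> {downstream}")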