Profiler v2 (#1089)
Co-authored-by: RdoubleA <[email protected]>
jeromeku and RdoubleA authored Jun 26, 2024
1 parent 15c918d commit 52e3283
Showing 22 changed files with 1,116 additions and 77 deletions.
2 changes: 1 addition & 1 deletion docs/source/api_ref_utilities.rst
@@ -87,7 +87,7 @@ of your finetuning job.
 
     get_memory_stats
     log_memory_stats
-    profiler
+    setup_torch_profiler
 
 .. _metric_logging_label:
 
23 changes: 21 additions & 2 deletions recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -83,6 +83,25 @@ log_every_n_steps: 1
 log_peak_memory_stats: False
 
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: ${output_dir}/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
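
For orientation, the fields above map onto the standard `torch.profiler` API roughly as follows. This is an illustrative sketch of what `setup_torch_profiler` configures, not its actual implementation; the output path is an example stand-in for the config's `output_dir`.

from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

# Activities selected by the `cpu` / `cuda` flags in the config
activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]

profiler = profile(
    activities=activities,
    # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
    schedule=schedule(wait=5, warmup=5, active=2, repeat=1),
    # Trace options from the config
    profile_memory=False,
    with_stack=False,
    record_shapes=True,
    with_flops=False,
    # Traces are written under the config's output_dir (example path here)
    on_trace_ready=tensorboard_trace_handler("/tmp/profiling_outputs"),
)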
23 changes: 21 additions & 2 deletions recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -86,6 +86,25 @@ log_peak_memory_stats: False
 # Showcase the usage of the PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: ${output_dir}/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
23 changes: 21 additions & 2 deletions recipes/configs/gemma/2B_lora_single_device.yaml
@@ -84,6 +84,25 @@ log_peak_memory_stats: False
 # Showcase the usage of the PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: /tmp/alpaca-gemma-finetune/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
23 changes: 21 additions & 2 deletions recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -84,6 +84,25 @@ log_peak_memory_stats: False
 # Showcase the usage of the PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: /tmp/alpaca-gemma-finetune/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
23 changes: 21 additions & 2 deletions recipes/configs/gemma/7B_lora_single_device.yaml
@@ -86,6 +86,25 @@ log_peak_memory_stats: False
 # Showcase the usage of the PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: /tmp/alpaca-gemma-finetune/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
23 changes: 21 additions & 2 deletions recipes/configs/gemma/7B_qlora_single_device.yaml
@@ -86,6 +86,25 @@ log_peak_memory_stats: False
 # Showcase the usage of the PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: /tmp/alpaca-gemma-finetune/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
23 changes: 21 additions & 2 deletions recipes/configs/llama2/13B_qlora_single_device.yaml
@@ -83,6 +83,25 @@ enable_activation_checkpointing: True
 # Showcase the usage of the PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: ${output_dir}/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
27 changes: 27 additions & 0 deletions recipes/configs/llama2/7B_lora.yaml
@@ -80,3 +80,30 @@ log_peak_memory_stats: False
 device: cuda
 dtype: bf16
 enable_activation_checkpointing: False
+
+# Showcase the usage of the PyTorch profiler
+# Set enabled to False as it's only needed for debugging training
+profiler:
+  _component_: torchtune.utils.setup_torch_profiler
+
+  enabled: False
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
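
Since the profiler section is a standard `_component_` config, a recipe can build it the same way it builds any other component. A minimal sketch, assuming torchtune's `config.instantiate` resolves the `_component_` field and forwards the remaining keys to `setup_torch_profiler`; the tuple return shape shown is an assumption for illustration.

from omegaconf import OmegaConf
from torchtune import config

cfg = OmegaConf.load("recipes/configs/llama2/7B_lora.yaml")

# Resolves _component_ to torchtune.utils.setup_torch_profiler and calls it
# with the remaining keys; the (profiler, resolved_config) return shape is
# an assumption, not confirmed API.
profiler, profiler_cfg = config.instantiate(cfg.profiler)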
23 changes: 21 additions & 2 deletions recipes/configs/llama2/7B_lora_single_device.yaml
@@ -83,6 +83,25 @@ enable_activation_checkpointing: True
 # Showcase the usage of the PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: ${output_dir}/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
23 changes: 21 additions & 2 deletions recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -82,6 +82,25 @@ enable_activation_checkpointing: True
 # Showcase the usage of the PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: ${output_dir}/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
22 changes: 21 additions & 1 deletion recipes/configs/llama3/8B_lora_single_device.yaml
@@ -81,5 +81,25 @@ enable_activation_checkpointing: True
 
 # Profiler (disabled)
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
22 changes: 21 additions & 1 deletion recipes/configs/llama3/8B_qlora_single_device.yaml
@@ -80,5 +80,25 @@ enable_activation_checkpointing: True
 
 # Profiler (disabled)
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
23 changes: 21 additions & 2 deletions recipes/configs/mistral/7B_lora_single_device.yaml
@@ -89,6 +89,25 @@ log_peak_memory_stats: False
 # Showcase the usage of the PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: /tmp/alpaca-llama2-finetune/torchtune_perf_tracing.json
+
+  # Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  # `torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  # Trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
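
Because these configs use a `torch.profiler.schedule`, the profiler only records anything if it is stepped once per iteration so that the wait/warmup/active window advances. A sketch of how a training loop might drive it; the loop body is a placeholder, not the recipe's actual code.

# `profiler` as built above; `model`, `optimizer`, and `batches` are placeholders.
with profiler as prof:
    for batch in batches:
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Advance the schedule: 5 wait + 5 warmup steps, then 2 active
        # (recorded) steps, repeated num_cycles (=1) times.
        prof.step()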