When profile_memory is enabled, also export and save snapshot.pickle for lora_finetune_single_device.py #1382

Merged 2 commits on Aug 21, 2024
11 changes: 11 additions & 0 deletions recipes/lora_finetune_single_device.py

@@ -328,6 +328,11 @@ def _setup_profiler(

log.info(f" Profiler config after instantiation: {profiler_cfg}")

self.profiler_wait_steps = profiler_cfg["wait_steps"]
Contributor:

This seems to be throwing an error in our recipe tests. Can you quickly inspect profiler_cfg after running, e.g.,

pytest tests/recipes/test_lora_finetune_single_device.py -m integration_test -k 'test_loss'

I thought these fields would be defined based on this, but it seems like something weird is happening here.

Contributor Author:

Ah, it's because profiling isn't enabled for tests, so it returns here:

return DummyProfiler(), DictConfig({"enabled": False})
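One way to avoid reading the missing schedule keys when that dummy config is returned would be a small guard around the config access. A minimal sketch, assuming the early return above; the helper name read_profiler_schedule and the fallback values are illustrative only and not part of this PR:

from omegaconf import DictConfig

def read_profiler_schedule(profiler_cfg: DictConfig) -> dict:
    # Read the profiler schedule fields, falling back to inert values when
    # profiling is disabled (e.g. the DummyProfiler path used in tests),
    # since DictConfig({"enabled": False}) has none of the schedule keys.
    if not profiler_cfg.get("enabled", True):
        return {"wait_steps": 0, "warmup_steps": 0, "active_steps": 0, "profile_memory": False}
    return {
        "wait_steps": profiler_cfg["wait_steps"],
        "warmup_steps": profiler_cfg["warmup_steps"],
        "active_steps": profiler_cfg["active_steps"],
        "profile_memory": profiler_cfg["profile_memory"],
    }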

self.profiler_warmup_steps = profiler_cfg["warmup_steps"]
self.profiler_active_steps = profiler_cfg["active_steps"]
self.profiler_profile_memory = profiler_cfg["profile_memory"]

return profiler

def _setup_model(
@@ -579,6 +584,9 @@ def train(self) -> None:
):
break

Contributor:

Suggested change:

# Memory profiling
if curr_epoch == 0 and self.profiler_profile_memory and idx == (self.profiler_wait_steps + self.profiler_warmup_steps):
    torch.cuda.memory._record_memory_history()

if curr_epoch == 0 and self.profiler_profile_memory and idx == self.profiler_wait_steps + self.profiler_warmup_steps:
    torch.cuda.memory._record_memory_history()
Contributor:

The issue is that in the future we may support other backends, like Intel. It makes me think this should be a function instead of being hardcoded as "torch.cuda".

cc: @ebsmothers

Contributor:

I think it's OK; I don't know that the memory snapshot would be supported on XPU anyway. We can do a check on the device + profile_memory combo just to be safe, though.
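For illustration, a minimal sketch of what such a device check could look like; the helper maybe_record_memory_history is hypothetical and not part of this PR:

import torch
import warnings

def maybe_record_memory_history(device: torch.device, profile_memory: bool) -> bool:
    # Start CUDA memory-history recording only when it is actually supported.
    # The memory snapshot API is CUDA-specific, so skip it (with a warning)
    # on other device types such as XPU or CPU.
    if not profile_memory:
        return False
    if device.type != "cuda":
        warnings.warn(
            f"profile_memory=True but device '{device.type}' does not support "
            "torch.cuda memory snapshots; skipping memory history recording."
        )
        return False
    torch.cuda.memory._record_memory_history()
    return True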


batch = {k: v.to(self._device) for k, v in batch.items()}
num_tokens += batch["tokens"].numel()

@@ -626,6 +634,9 @@ def train(self) -> None:
num_tokens = 0
t0 = time.perf_counter()

if curr_epoch == 0 and self.profiler_profile_memory and idx == self.profiler_wait_steps + self.profiler_warmup_steps + self.profiler_active_steps:
Contributor:

Suggested change:

# Stop memory profiling
if curr_epoch == 0 and self.profiler_profile_memory and idx == self.profiler_wait_steps + self.profiler_warmup_steps + self.profiler_active_steps:

    torch.cuda.memory._record_memory_history(enabled=None)

# Step the profiler
# Note we are stepping each batch, which might not include optimizer step in the trace
# if the schedule cycle doesn't align with gradient accumulation.
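As a small standalone illustration of that note (the step counts below are made-up example values, not torchtune defaults):

# Hypothetical schedule, counted in batches: the profiler is stepped per batch,
# while the optimizer only steps every `grad_accum` batches, so the active
# window may contain no optimizer step at all.
wait_steps, warmup_steps, active_steps = 5, 3, 2
grad_accum = 4

active_batches = range(wait_steps + warmup_steps, wait_steps + warmup_steps + active_steps)
optimizer_step_batches = [i for i in active_batches if (i + 1) % grad_accum == 0]
print(list(active_batches))    # [8, 9]
print(optimizer_step_batches)  # [] -> no optimizer step lands in the traced window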
4 changes: 3 additions & 1 deletion torchtune/utils/_profiler.py
@@ -64,7 +64,7 @@ def trace_handler(
The following artifacts are exported:
- chrome / tensorboard trace - viewable through tensorboard or perfetto.dev / chrome::/tracing
- trace event table
- memory timeline if ``profile_memory``
- memory timeline and snapshot.pickle if ``profile_memory``
- stacks if ``with_stack`` (note that ``profile_memory`` requires ``with_stack`` to be ``True``),
viewable as a flamegraph see (https://pytorch.org/docs/stable/profiler.html#torch.profiler._KinetoProfile.export_stacks).

@@ -115,6 +115,8 @@ def trace_handler(
    except Exception as e:
        log.warn(f" Failed to export memory timeline: {e}")

    torch.cuda.memory._dump_snapshot(f"{curr_trace_dir}/rank{rank}_memory_snapshot.pickle")

# Dump stack traces
if prof.with_stack:
prof.export_stacks(f"{curr_trace_dir}/rank{rank}_stacks.txt", metric=metric)
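As a usage note (not part of this diff): the dumped pickle can be inspected by dragging it into PyTorch's memory visualizer at https://pytorch.org/memory_viz, or loaded directly. A minimal sketch; the path below is just an example, not the exact directory layout torchtune produces:

import pickle

# Example path; substitute the actual trace directory and rank from your run.
snapshot_path = "profiler_output/rank0_memory_snapshot.pickle"

with open(snapshot_path, "rb") as f:
    snapshot = pickle.load(f)

# The snapshot is a plain dict; "segments" describes allocator segments/blocks
# and "device_traces" holds the recorded alloc/free events per device.
print(snapshot.keys())
print(len(snapshot["segments"]), "segments recorded")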