Dana-Farber-AIOS · tddough98 · Oct 14, 2022 · Oct 14, 2022 · Oct 14, 2022 · Oct 14, 2022
diff --git a/pathml/core/slide_backends.py b/pathml/core/slide_backends.py
@@ -5,6 +5,7 @@
 
 from io import BytesIO
 
+import dask
 import numpy as np
 import openslide
 from javabridge.jutil import JavaException
@@ -62,8 +63,14 @@ class OpenSlideBackend(SlideBackend):
  def __init__(self, filename):
  logger.info(f"OpenSlideBackend loading file at: {filename}")
  self.filename = filename
- self.slide = openslide.open_slide(filename=filename)
- self.level_count = self.slide.level_count
+
+ @property
+ def slide(self):
+     """Open the slide file at ``self.filename`` and return the handle.
+
+     Nothing is cached on the instance: each access calls
+     ``openslide.open_slide`` again and hands back a new object.
+     """
+     handle = openslide.open_slide(filename=self.filename)
+     return handle
+
+ @property
+ def level_count(self):
+     """Number of levels reported by the slide at ``self.filename``.
+
+     Returns:
+         The ``level_count`` attribute of the opened slide.
+     """
+     # ``self.slide`` opens a new handle on every access, so release it as
+     # soon as the value has been read rather than leaving it to the
+     # garbage collector.
+     with self.slide as slide:
+         return slide.level_count
 
  def __repr__(self):
  return f"OpenSlideBackend('{self.filename}')"
@@ -211,9 +218,10 @@ def generate_tiles(self, shape=3000, stride=None, pad=False, level=0):
  for ix_i in range(n_tiles_i):
  for ix_j in range(n_tiles_j):
  coords = (int(ix_i * stride_i), int(ix_j * stride_j))
- # get image for tile
- tile_im = self.extract_region(location=coords, size=shape, level=level)
- yield pathml.core.tile.Tile(image=tile_im, coords=coords)
+ image = dask.delayed(self.extract_region)(
+ location=coords, size=shape, level=level
+ )
+ yield pathml.core.tile.Tile(image, coords=coords)
 
 
 def _init_logger():
@@ -421,7 +429,10 @@ def extract_region(
  f"Multi-level images not supported with series_as_channels=True. Input 'level={level}' invalid. Use 'level=0'."
  )
 
- javabridge.start_vm(class_path=bioformats.JARS, max_heap_size="100G")
+ javabridge.start_vm(
+ class_path=bioformats.JARS, max_heap_size="100G", run_headless=True
+ )
+
  with bioformats.ImageReader(str(self.filename), perform_init=True) as reader:
  # expand size
  logger.info(f"extracting region with input size = {size}")
@@ -593,27 +604,28 @@ def generate_tiles(self, shape=3000, stride=None, pad=False, level=0, **kwargs):
  for ix_j in range(n_tiles_j):
  coords = (int(ix_i * stride_i), int(ix_j * stride_j))
  if coords[0] + shape[0] < i and coords[1] + shape[1] < j:
- # get image for tile
- tile_im = self.extract_region(
+ image = dask.delayed(self.extract_region)(
  location=coords, size=shape, level=level, **kwargs
  )
-  yield pathml.core.tile.Tile(image=tile_im, coords=coords)
+ # Image on edge and needs to be padded with 0s
  else:
- unpaddedshape = (
+ unpadded_shape = (
  i - coords[0] if coords[0] + shape[0] > i else shape[0],
  j - coords[1] if coords[1] + shape[1] > j else shape[1],
  )
- tile_im = self.extract_region(
- location=coords, size=unpaddedshape, level=level, **kwargs
+ edge_image = dask.delayed(self.extract_region)(
+ location=coords, size=unpadded_shape, level=level, **kwargs
  )
- zeroarrayshape = list(tile_im.shape)
- zeroarrayshape[0], zeroarrayshape[1] = (
- list(shape)[0],
- list(shape)[1],
- )
- padded_im = np.zeros(zeroarrayshape)
- padded_im[: tile_im.shape[0], : tile_im.shape[1], ...] = tile_im
- yield pathml.core.tile.Tile(image=padded_im, coords=coords)
+
+ def pad(image):
+     """Pad an edge tile with zeros up to the requested tile ``shape``.
+
+     Args:
+         image (np.ndarray): Region extracted at the slide edge. Its
+             first two axes are the ones that may be shorter than
+             ``shape``; all remaining axes are kept unchanged.
+
+     Returns:
+         np.ndarray: Zero-filled array of shape
+         ``(shape[0], shape[1], *image.shape[2:])`` with ``image``
+         copied into its top-left corner.
+     """
+     # NOTE(review): this helper reuses the name of the enclosing
+     # ``pad`` argument; confirm that argument is not read after this
+     # point in the loop.
+     # Only the two leading axes grow to the tile size. The trailing
+     # axes come from ``image.shape[2:]``, matching the shape the
+     # previous inline implementation produced.
+     padded = np.zeros((shape[0], shape[1], *image.shape[2:]))
+     padded[: image.shape[0], : image.shape[1], ...] = image
+     return padded
+
+ # Need to delay to use shape of edge_image
+ image = dask.delayed(pad)(edge_image)
+ yield pathml.core.tile.Tile(image=image, coords=coords)
 
 
 class DICOMBackend(SlideBackend):
@@ -653,19 +665,25 @@ def __init__(self, filename):
  f"DICOM metadata: frame_shape={self.frame_shape}, nrows = {self.n_rows}, ncols = {self.n_cols}"
  )
 
- # actual file
- self.fp = DicomFile(self.filename, mode="rb")
- self.fp.is_little_endian = self.transfer_syntax_uid.is_little_endian
- self.fp.is_implicit_VR = self.transfer_syntax_uid.is_implicit_VR
+ fp = self.fp
+
  # need to do this to advance the file to the correct point, at the beginning of the pixels
- self.metadata = dcmread(self.fp, stop_before_pixels=True)
- self.pixel_data_offset = self.fp.tell()
- self.fp.seek(self.pixel_data_offset, 0)
+ self.metadata = dcmread(fp, stop_before_pixels=True)
+ pixel_data_offset = fp.tell()
+ fp.seek(pixel_data_offset, 0)
  # note that reading this tag is necessary to advance the file to correct position
- _ = TupleTag(self.fp.read_tag())
+ _ = TupleTag(fp.read_tag())
  # get basic offset table, to enable reading individual frames without loading entire image
- self.bot = self.get_bot(self.fp)
- self.first_frame = self.fp.tell()
+ self.bot = self.get_bot(fp)
+ self.first_frame = fp.tell()
+
+ @property
+ def fp(self):
+     """Open ``self.filename`` as a new binary ``DicomFile`` and return it.
+
+     Every access builds a separate file object, configured with the
+     ``is_little_endian`` and ``is_implicit_VR`` flags taken from
+     ``self.transfer_syntax_uid``. A caller that needs to keep a read
+     position must hold on to the object it was given.
+     """
+     handle = DicomFile(self.filename, mode="rb")
+     syntax = self.transfer_syntax_uid
+     handle.is_little_endian = syntax.is_little_endian
+     handle.is_implicit_VR = syntax.is_implicit_VR
+     return handle
 
  def __repr__(self):
  out = f"DICOMBackend('{self.filename}')\n"
@@ -807,7 +825,9 @@ def _read_frame(self, frame_ix):
  np.ndarray: pixel data of that frame
  """
  frame_offset = self.bot[int(frame_ix)]
- self.fp.seek(self.first_frame + frame_offset, 0)
+ # self.fp refers to a different filelike object each time it is accessed
+ fp = self.fp
+ fp.seek(self.first_frame + frame_offset, 0)
  try:
  stop_at = self.bot[frame_ix + 1] - frame_offset
  except IndexError:
@@ -816,11 +836,11 @@ def _read_frame(self, frame_ix):
  # A frame may be comprised of multiple chunks
  chunks = []
  while True:
- tag = TupleTag(self.fp.read_tag())
+ tag = TupleTag(fp.read_tag())
  if n == stop_at or int(tag) == SequenceDelimiterTag:
  break
- length = self.fp.read_UL()
- chunks.append(self.fp.read(length))
+ length = fp.read_UL()
+ chunks.append(fp.read(length))
  n += 8 + length
 
  frame_bytes = b"".join(chunks)
@@ -899,7 +919,7 @@ def generate_tiles(self, shape, stride, pad, level=0, **kwargs):
  if i >= (self.n_frames - self.n_cols):
  continue
 
- frame_im = self.extract_region(location=i)
+ im = dask.delayed(self.extract_region)(location=i)
  coords = self._index_to_coords(i)
- frame_tile = pathml.core.tile.Tile(image=frame_im, coords=coords)
- yield frame_tile
+ tile = pathml.core.tile.Tile(image=im, coords=coords)
+ yield tile
diff --git a/pathml/core/slide_data.py b/pathml/core/slide_data.py
@@ -17,6 +17,7 @@
 import pathml.core
 import pathml.preprocessing.pipeline
 from pathml.core.slide_types import SlideType
+from pathml.preprocessing.transforms import DropTileException
 
 
 def infer_backend(path):
@@ -309,31 +310,47 @@ def run(
  )
 
  # map pipeline application onto each tile
- processed_tile_futures = []
+ futures = [
+ client.submit(pipeline.apply, tile)
+ for tile in self.generate_tiles(
+ level=level,
+ shape=tile_size,
+ stride=tile_stride,
+ pad=tile_pad,
+ **kwargs,
+ )
+ ]
 
- for tile in self.generate_tiles(
- level=level,
- shape=tile_size,
- stride=tile_stride,
- pad=tile_pad,
- **kwargs,
- ):
- if not tile.slide_type:
- tile.slide_type = self.slide_type
- # explicitly scatter data, i.e. send the tile data out to the cluster before applying the pipeline
- # according to dask, this can reduce scheduler burden and keep data on workers
- big_future = client.scatter(tile)
- f = client.submit(pipeline.apply, big_future)
- processed_tile_futures.append(f)
-
- # as tiles are processed, add them to h5
- for future, tile in dask.distributed.as_completed(
- processed_tile_futures, with_results=True
+ # After a worker processes a tile, add the tile to h5
+ for future, result in dask.distributed.as_completed(
+ futures, with_results=True, raise_errors=False
  ):
- self.tiles.add(tile)
+ if future.status == "finished":
+ self.tiles.add(result)
+ if future.status == "error":
+ typ, exc, tb = result
+ if typ is DropTileException:
+ pass
+ else:
+ raise exc.with_traceback(tb)
+ # TODO: Free memory used for tile
+ # Each in-memory future holding a Tile shows a size of 48 bytes on the Dask dashboard
+ # which clearly does not include image data.
+ # Could it be that loaded image data is somehow not being garbage collected with Tiles?
+
+ # # all of these still leave unmanaged memory on each worker
+ # future.release()
+ # future.cancel()
+ # del result
+ # del future
+ # del futures
 
  if shutdown_after:
  client.shutdown()
+ else:
+ pass
+ # Stopgap to free unmanaged memory on client before processing another slide
+ client.restart()
 
  else:
  for tile in self.generate_tiles(
@@ -343,8 +360,6 @@ def run(
  pad=tile_pad,
  **kwargs,
  ):
- if not tile.slide_type:
- tile.slide_type = self.slide_type
  pipeline.apply(tile)
  self.tiles.add(tile)
 
@@ -410,14 +425,19 @@ def generate_tiles(self, shape=3000, stride=None, pad=False, **kwargs):
  pathml.core.tile.Tile: Extracted Tile object
  """
  for tile in self.slide.generate_tiles(shape, stride, pad, **kwargs):
+ # TODO: move to worker!! (forces loading data on main thread)
+
  # add masks for tile, if possible
  # i.e. if the SlideData has a Masks object, and the tile has coordinates
  if self.masks is not None and tile.coords is not None:
  # masks not supported if pad=True
  # to implement, need to update Mask.slice to support slices that go beyond the full mask
  if not pad:
  i, j = tile.coords
- di, dj = tile.image.shape[0:2]
+ # Accessing image loads data on main thread
+ # dask.delayed waits until compute is called on worker
+ shape = dask.delayed(tile).image.shape[0:2]
+ di, dj = shape[0], shape[1]
  # add the Masks object for the masks corresponding to the tile
  # this assumes that the tile didn't already have any masks
  # this should work since the backend reads from image only
@@ -430,6 +450,8 @@ def generate_tiles(self, shape=3000, stride=None, pad=False, **kwargs):
  tile_slices = [slice(i, i + di), slice(j, j + dj)]
  tile.masks = self.masks.slice(tile_slices)
 
+ # TODO: end move to worker
+
  # add slide-level labels to each tile, if possible
  if self.labels is not None:
  tile.labels = self.labels

diff --git a/pathml/core/tile.py b/pathml/core/tile.py
@@ -7,9 +7,11 @@
 from collections import OrderedDict
 
 import anndata
+import dask
 import h5py
 import matplotlib.pyplot as plt
 import numpy as np
+from dask.delayed import Delayed
 
 import pathml.core.masks
 
@@ -21,7 +23,7 @@ class Tile:
  on labelling the top-leftmost pixel as (0, 0)
 
  Args:
- image (np.ndarray): Image array of tile
+ image (np.ndarray or dask.delayed.Delayed): Tile image or dask.delayed.Delayed object to load image
  coords (tuple): Coordinates of tile relative to the whole-slide image.
  The (i,j) coordinate system is based on labelling the top-leftmost pixel of the WSI as (0, 0).
  name (str, optional): Name of tile
@@ -60,9 +62,9 @@ def __init__(
  time_series=None,
  ):
  # check inputs
- assert isinstance(
+ assert isinstance(image, Delayed) or isinstance(
  image, np.ndarray
- ), f"image of type {type(image)} must be a np.ndarray"
+ ), f"image of type {type(image)} must be a np.ndarray or a dask.delayed.Delayed object"
  assert masks is None or isinstance(
  masks, dict
  ), f"masks is of type {type(masks)} but must be of type dict"
@@ -115,23 +117,38 @@ def __init__(
  counts, anndata.AnnData
  ), f"counts is of type {type(counts)} but must be of type anndata.AnnData or None"
 
- if masks:
- for val in masks.values():
- if val.shape[:2] != image.shape[:2]:
- raise ValueError(
- f"mask is of shape {val.shape} but must match tile shape {image.shape}"
- )
- self.masks = masks
- else:
- self.masks = OrderedDict()
-
- self.image = image
+ self._image = image
+ self.masks = masks if masks else OrderedDict()
  self.name = name
  self.coords = coords
  self.slide_type = slide_type
  self.labels = labels
  self.counts = counts
 
+ @property
+ def image(self):
+     """Tile image, computed on first access when it was given as a Delayed.
+
+     If the stored image is a ``dask.delayed.Delayed`` it is computed with
+     the single-threaded scheduler, checked, and stored in place of the
+     Delayed so later accesses return the array directly.
+
+     Raises:
+         AssertionError: if the computed result is not a ``np.ndarray``.
+         ValueError: if any mask's first two dimensions differ from those
+             of the computed image.
+     """
+     if not isinstance(self._image, Delayed):
+         return self._image
+
+     loaded = dask.compute(self._image, scheduler="single-threaded")
+     if isinstance(loaded, tuple):
+         # keep only the first computed result
+         loaded = loaded[0]
+     assert isinstance(
+         loaded, np.ndarray
+     ), f"image of type {type(loaded)} must be a np.ndarray"
+     # Masks can only be checked against the image once its shape is known.
+     for mask in self.masks.values():
+         if mask.shape[:2] != loaded.shape[:2]:
+             raise ValueError(
+                 f"mask is of shape {mask.shape} but must match tile shape {loaded.shape}"
+             )
+     self._image = loaded
+     return loaded
+
+ @image.setter
+ def image(self, image):
+     """Replace the tile image with an already-loaded array.
+
+     Args:
+         image (np.ndarray): New image for the tile. Anything else,
+             including a Delayed object, fails the assertion.
+     """
+     is_array = isinstance(image, np.ndarray)
+     assert is_array, f"image of type {type(image)} must be a np.ndarray"
+     self._image = image
+
  def __repr__(self):
  out = []
  out.append(f"Tile(coords={self.coords}")