Commit
Merge pull request #46 from GilbertLabUCSF/abe-dev
debug, new modules, and code improvements
abearab authored Jun 24, 2024
2 parents 93b6196 + fbdfee2 commit 7a63899
Showing 22 changed files with 554 additions and 266 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
python-version: ["3.11"]

steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/python-publish.yml
@@ -15,7 +15,7 @@ jobs:
fail-fast: false
matrix:
os-version: ["ubuntu-latest"]
python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
python-version: ["3.11"]

steps:
- uses: actions/checkout@v3
Binary file modified .gitignore
Binary file not shown.
2 changes: 1 addition & 1 deletion CanDI/__version__.py
@@ -1 +1 @@
version = "0.1.1"
version = "0.2.0"
4 changes: 3 additions & 1 deletion CanDI/candi/__init__.py
@@ -1,4 +1,6 @@
from . import load
from . import data

data = data.Data() #Global object data instantiated on import required for access by GeneQuery Objects
from . import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)

from .candi import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)
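Note on the change above: the old `from . import (Gene, ...)` line fails because those classes are defined in the `candi` submodule, not the package root; the new line re-exports them from `CanDI/candi/candi.py`. A minimal usage sketch, assuming an installed package (the identifiers passed to the constructors are illustrative):

from CanDI import candi

# The module-level `data` object is instantiated on import and backs the
# query classes re-exported above.
gene = candi.Gene("KRAS")      # illustrative gene symbol
line = candi.CellLine("A549")  # illustrative cell-line name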
4 changes: 2 additions & 2 deletions CanDI/candi/candi.py
@@ -1,11 +1,11 @@
# Classes for handling data aggregations
import operator
from collections import OrderedDict, MutableSequence
from collections.abc import MutableSequence
import itertools as it
import pandas as pd
import numpy as np
from . import data, grabber
from . import entity
from ..structures import entity

class SubsetHandler(object):

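Note on the import change above: the ABC aliases in the top-level `collections` module, deprecated since Python 3.3, were removed in Python 3.10, so `from collections import MutableSequence` no longer works on the new Python 3.11 CI target (the `OrderedDict` import is dropped at the same time). In short:

from collections.abc import MutableSequence  # works on every supported Python version

# The removed form raises ImportError on Python >= 3.10:
#   from collections import MutableSequence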
17 changes: 12 additions & 5 deletions CanDI/candi/data.py
@@ -14,21 +14,27 @@ class Data(object):
can be tuned to load specific datasets upon import by editing config.ini
can call Data.load() to load any specific dataset
"""
def __init__(self):
def __init__(self, config_path='auto', verbose=False):

self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
config_path = self._file_path / 'data/config.ini'
if config_path == 'auto':
self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
config_path = self._file_path / 'data/config.ini'
elif os.path.exists(config_path) == False:
raise FileNotFoundError("Config file not found at {}".format(config_path))
elif os.path.exists(config_path) == True:
if verbose: print("Using config file at {}".format(config_path))

parser = configparser.ConfigParser() #parses config for data sources
parser.read(config_path)

self._parser = parser
#self._verify_install()
self._verify_install()
self._init_sources()
self._init_depmap_paths()
# self._init_index_tables()
self._init_index_tables()

def _verify_install(self): #ensures data being loaded is present
#TODO: add more checks for different data sources
try:
assert "depmap_urls" in self._parser.sections()
except AssertionError:
@@ -91,6 +97,7 @@ def _handle_autoload(method, path):
df = pd.read_csv(path,
memory_map=True,
low_memory=False,
sep='\t',
index_col="DepMap_ID")

elif method == "locations":
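Note on the constructor change above: `Data()` now accepts an explicit `config_path` (defaulting to the bundled config under `setup/data/config.ini`) and a `verbose` flag, and it re-enables `_verify_install()` and `_init_index_tables()`. A minimal sketch of the new options; the explicit path below is hypothetical:

from CanDI.candi.data import Data

data = Data()  # config_path='auto': resolve the bundled setup/data/config.ini

# Point at a custom config instead; raises FileNotFoundError if the file is
# missing and, with verbose=True, prints which config file is being used.
data = Data(config_path="/path/to/config.ini", verbose=True)  # hypothetical path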
Empty file added CanDI/pipelines/__init__.py
Empty file.
52 changes: 52 additions & 0 deletions CanDI/pipelines/diffexp.py
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
import anndata as ad

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats
from adpbulk import ADPBulk


def pseudobulk_by_group(adt, groups, method="mean"):
# initialize the object
adpb = ADPBulk(adt, groupby=groups, method=method)

# perform the pseudobulking
pseudobulk_matrix = adpb.fit_transform()

# retrieve the sample metadata (useful for easy incorporation with edgeR)
sample_meta = adpb.get_meta()

out = ad.AnnData(
X=pseudobulk_matrix,
obs=sample_meta.set_index('SampleName')
)

return out


def run_deseq(adata, design, tested_level, ref_level, n_cpus=8):

inference = DefaultInference(n_cpus=n_cpus)

dds = DeseqDataSet(
counts=adata.to_df().astype(int),
metadata=adata.obs,
design_factors=design, # compare samples based on the "condition"
refit_cooks=True,
inference=inference,
)

dds.deseq2()

stat_res = DeseqStats(
dds,
contrast=[design, tested_level, ref_level],
inference=inference
)
stat_res.summary()

df = stat_res.results_df

return df
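The new `CanDI.pipelines.diffexp` module chains adpbulk pseudobulking into a PyDESeq2 Wald test. A usage sketch, assuming a cell-level AnnData with raw counts in `.X` and a categorical `condition` column in `.obs` (the file name and condition labels are hypothetical):

import anndata as ad

from CanDI.pipelines.diffexp import pseudobulk_by_group, run_deseq

adata = ad.read_h5ad("perturb_counts.h5ad")  # hypothetical input file

# Summing raw counts per group keeps DESeq2-friendly integer-like values
# (run_deseq casts the matrix to int in any case).
pb = pseudobulk_by_group(adata, groups=["condition"], method="sum")

res = run_deseq(pb, design="condition", tested_level="treated", ref_level="control")
print(res.head())  # per-gene log2FoldChange, pvalue, padj, ...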
189 changes: 117 additions & 72 deletions CanDI/setup/dataverse.py
@@ -9,6 +9,18 @@

CANDI_DATAVERSE_DOI = 'doi:10.7910/DVN/JIAT0H'


### Datasets Metadata ###

coessentiality_dataset_names = [
'genes',
# 10273535
'GLS_p',
# 10273534
'GLS_sign',
# 10273533
]

depmap_dataset_names = [
'CCLE_expression',
'CCLE_fusions',
@@ -22,6 +34,11 @@
]

name2type = {
# Coessentiality datasets
'genes': 'txt',
'GLS_p': 'npy',
'GLS_sign': 'npy',
# DepMap datasets
'CCLE_expression': 'csv',
'CCLE_fusions': 'csv',
'CCLE_gene_cn': 'csv',
@@ -34,6 +51,11 @@
}

name2id = {
# Coessentiality datasets
'genes': 10273535,
'GLS_p': 10273534,
'GLS_sign': 10273533,
# DepMap datasets
'CCLE_expression': 8076862,
'CCLE_fusions': 10085763,
'CCLE_gene_cn': 8076861,
@@ -46,6 +68,7 @@
}


### Utility functions ###
def print_sys(s):
"""system print
@@ -55,80 +78,102 @@ def print_sys(s):
print(s, flush = True, file = sys.stderr)


def dataverse_download(url, path, name, types):
"""dataverse download helper with progress bar
Args:
url (str): the url of the dataset
path (str): the path to save the dataset
name (str): the dataset name
types (dict): a dictionary mapping from the dataset name to the file format
"""
save_path = os.path.join(path, f"{name}.{types[name]}")
response = requests.get(url, stream=True)
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
with open(save_path, "wb") as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()


def download_wrapper(name, path, return_type=None):
"""wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files
Args:
name (str): the rough dataset query name
path (str): the path to save the dataset
return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
### Downloading scripts ###

class Downloader:
def __init__(self):
pass

def _dataverse_download(self, url, path, name, types):
"""dataverse download helper with progress bar
Args:
url (str): the url of the dataset
path (str): the path to save the dataset
name (str): the dataset name
types (dict): a dictionary mapping from the dataset name to the file format
"""
save_path = os.path.join(path, f"{name}.{types[name]}")
response = requests.get(url, stream=True)
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
with open(save_path, "wb") as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()


def _download_wrapper(self, name, path, return_type=None):
"""wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files
Args:
name (str): the rough dataset query name
path (str): the path to save the dataset
return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
Returns:
str: the exact dataset query name
"""
server_path = "https://dataverse.harvard.edu/api/access/datafile/"

url = server_path + str(name2id[name])

if not os.path.exists(path):
os.mkdir(path)

file_name = f"{name}.{name2type[name]}"

if os.path.exists(os.path.join(path, file_name)):
print_sys("Found local copy...")
os.path.join(path, file_name)
else:
print_sys("Downloading...")
self._dataverse_download(url, path, name, name2type)

if return_type == "url":
return url
elif return_type == "name":
return file_name
elif return_type == ["url", "name"]:
return url, file_name

Returns:
str: the exact dataset query name
"""
server_path = "https://dataverse.harvard.edu/api/access/datafile/"

url = server_path + str(name2id[name])

if not os.path.exists(path):
os.mkdir(path)

file_name = f"{name}.{name2type[name]}"

if os.path.exists(os.path.join(path, file_name)):
print_sys("Found local copy...")
os.path.join(path, file_name)
else:
print_sys("Downloading...")
dataverse_download(url, path, name, name2type)

if return_type == "url":
return url
elif return_type == "name":
return file_name
elif return_type == ["url", "name"]:
return url, file_name


def depmap_dataverse_download(path, return_type=None):
"""download all datasets to the path
def run(self, path, datasets, return_type=None):
"""download all datasets to the path
Args:
path (str): the path to save the datasets
return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
"""
url_list = []
file_names = []

for name in datasets:
url, file_name = self._download_wrapper(name, path, return_type=["url", "name"])
url_list.append(url)
file_names.append(file_name)

if return_type == "url":
return url_list
elif return_type == "name":
return file_names
elif return_type == ["url", "name"]:
return url_list, file_names


class DepMapDownloader(Downloader):
def __init__(self):
super().__init__()

def download(self, path, return_type=None):
return self.run(path, depmap_dataset_names, return_type)

Args:
path (str): the path to save the datasets
return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
"""
url_list = []
file_names = []

for name in depmap_dataset_names:
url, file_name = download_wrapper(name, path, return_type=["url", "name"])
url_list.append(url)
file_names.append(file_name)
class CoessentialityDownloader(Downloader):
def __init__(self):
super().__init__()

if return_type == "url":
return url_list
elif return_type == "name":
return file_names
elif return_type == ["url", "name"]:
return url_list, file_names
def download(self, path, return_type=None):
return self.run(path, coessentiality_dataset_names, return_type)
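The refactor above replaces the module-level `depmap_dataverse_download` helper with a reusable `Downloader` base class and per-collection subclasses, so the coessentiality matrices hosted on the same Dataverse are fetched through the same code path. A sketch of driving the new classes (target directories are hypothetical, and must be creatable with a single `os.mkdir`):

from CanDI.setup.dataverse import DepMapDownloader, CoessentialityDownloader

# Fetch every DepMap file, returning the saved file names.
files = DepMapDownloader().download("depmap_data", return_type="name")

# Fetch the coessentiality matrices, returning both URLs and file names.
urls, names = CoessentialityDownloader().download(
    "coessentiality_data", return_type=["url", "name"]
)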
