Commit
Merge pull request #46 from GilbertLabUCSF/abe-dev
debug, new modules, and code improvements
abearab authored Jun 24, 2024
2 parents 93b6196 + fbdfee2 commit 7a63899
Showing 22 changed files with 554 additions and 266 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
python-version: ["3.11"]

steps:
- uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/python-publish.yml
@@ -15,7 +15,7 @@ jobs:
fail-fast: false
matrix:
os-version: ["ubuntu-latest"]
python-version: ["3.9"] # ["3.8", "3.9", "3.10"]
python-version: ["3.11"]

steps:
- uses: actions/checkout@v3
Binary file modified .gitignore
Binary file not shown.
2 changes: 1 addition & 1 deletion CanDI/__version__.py
@@ -1 +1 @@
version = "0.1.1"
version = "0.2.0"
4 changes: 3 additions & 1 deletion CanDI/candi/__init__.py
@@ -1,4 +1,6 @@
from . import load
from . import data

data = data.Data() #Global object data instantiated on import required for access by GeneQuery Objects
from . import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)

from .candi import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster)
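Note on the change above: the old `from . import (Gene, ...)` line fails because those classes are defined in the `candi` submodule, not the package root; the new line re-exports them from `CanDI/candi/candi.py`. A minimal usage sketch, assuming an installed package (the identifiers passed to the constructors are illustrative):

from CanDI import candi

# The module-level `data` object is instantiated on import and backs the
# query classes re-exported above.
gene = candi.Gene("KRAS")      # illustrative gene symbol
line = candi.CellLine("A549")  # illustrative cell-line name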
4 changes: 2 additions & 2 deletions CanDI/candi/candi.py
@@ -1,11 +1,11 @@
# Classes for handling data aggregations
import operator
from collections import OrderedDict, MutableSequence
from collections.abc import MutableSequence
import itertools as it
import pandas as pd
import numpy as np
from . import data, grabber
from . import entity
from ..structures import entity

class SubsetHandler(object):

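Note on the import change above: the ABC aliases in the top-level `collections` module, deprecated since Python 3.3, were removed in Python 3.10, so `from collections import MutableSequence` no longer works on the new Python 3.11 CI target (the `OrderedDict` import is dropped at the same time). In short:

from collections.abc import MutableSequence  # works on every supported Python version

# The removed form raises ImportError on Python >= 3.10:
#   from collections import MutableSequence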
17 changes: 12 additions & 5 deletions CanDI/candi/data.py
@@ -14,21 +14,27 @@ class Data(object):
can be tuned to load specific datasets upon import by editing config.ini
can call Data.load() to load any specific dataset
"""
def __init__(self):
def __init__(self, config_path='auto', verbose=False):

self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
config_path = self._file_path / 'data/config.ini'
if config_path == 'auto':
self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup'
config_path = self._file_path / 'data/config.ini'
elif os.path.exists(config_path) == False:
raise FileNotFoundError("Config file not found at {}".format(config_path))
elif os.path.exists(config_path) == True:
if verbose: print("Using config file at {}".format(config_path))

parser = configparser.ConfigParser() #parses config for data sources
parser.read(config_path)

self._parser = parser
#self._verify_install()
self._verify_install()
self._init_sources()
self._init_depmap_paths()
# self._init_index_tables()
self._init_index_tables()

def _verify_install(self): #ensures data being loaded is present
#TODO: add more checks for different data sources
try:
assert "depmap_urls" in self._parser.sections()
except AssertionError:
@@ -91,6 +97,7 @@ def _handle_autoload(method, path):
df = pd.read_csv(path,
memory_map=True,
low_memory=False,
sep='\t',
index_col="DepMap_ID")

elif method == "locations":
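Note on the constructor change above: `Data()` now accepts an explicit `config_path` (defaulting to the bundled config under `setup/data/config.ini`) and a `verbose` flag, and it re-enables `_verify_install()` and `_init_index_tables()`. A minimal sketch of the new options; the explicit path below is hypothetical:

from CanDI.candi.data import Data

data = Data()  # config_path='auto': resolve the bundled setup/data/config.ini

# Point at a custom config instead; raises FileNotFoundError if the file is
# missing and, with verbose=True, prints which config file is being used.
data = Data(config_path="/path/to/config.ini", verbose=True)  # hypothetical path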
Empty file added CanDI/pipelines/__init__.py
Empty file.
52 changes: 52 additions & 0 deletions CanDI/pipelines/diffexp.py
@@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
import anndata as ad

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.ds import DeseqStats
from adpbulk import ADPBulk


def pseudobulk_by_group(adt, groups, method="mean"):
# initialize the object
adpb = ADPBulk(adt, groupby=groups, method=method)

# perform the pseudobulking
pseudobulk_matrix = adpb.fit_transform()

# retrieve the sample metadata (useful for easy incorporation with edgeR)
sample_meta = adpb.get_meta()

out = ad.AnnData(
X=pseudobulk_matrix,
obs=sample_meta.set_index('SampleName')
)

return out


def run_deseq(adata, design, tested_level, ref_level, n_cpus=8):

inference = DefaultInference(n_cpus=n_cpus)

dds = DeseqDataSet(
counts=adata.to_df().astype(int),
metadata=adata.obs,
design_factors=design, # compare samples based on the "condition"
refit_cooks=True,
inference=inference,
)

dds.deseq2()

stat_res = DeseqStats(
dds,
contrast=[design, tested_level, ref_level],
inference=inference
)
stat_res.summary()

df = stat_res.results_df

return df
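The new `CanDI.pipelines.diffexp` module chains adpbulk pseudobulking into a PyDESeq2 Wald test. A usage sketch, assuming a cell-level AnnData with raw counts in `.X` and a categorical `condition` column in `.obs` (the file name and condition labels are hypothetical):

import anndata as ad

from CanDI.pipelines.diffexp import pseudobulk_by_group, run_deseq

adata = ad.read_h5ad("perturb_counts.h5ad")  # hypothetical input file

# Summing raw counts per group keeps DESeq2-friendly integer-like values
# (run_deseq casts the matrix to int in any case).
pb = pseudobulk_by_group(adata, groups=["condition"], method="sum")

res = run_deseq(pb, design="condition", tested_level="treated", ref_level="control")
print(res.head())  # per-gene log2FoldChange, pvalue, padj, ...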
189 changes: 117 additions & 72 deletions CanDI/setup/dataverse.py
@@ -9,6 +9,18 @@

CANDI_DATAVERSE_DOI = 'doi:10.7910/DVN/JIAT0H'


### Datasets Metadata ###

coessentiality_dataset_names = [
'genes',
# 10273535
'GLS_p',
# 10273534
'GLS_sign',
# 10273533
]

depmap_dataset_names = [
'CCLE_expression',
'CCLE_fusions',
@@ -22,6 +34,11 @@
]

name2type = {
# Coessentiality datasets
'genes': 'txt',
'GLS_p': 'npy',
'GLS_sign': 'npy',
# DepMap datasets
'CCLE_expression': 'csv',
'CCLE_fusions': 'csv',
'CCLE_gene_cn': 'csv',
@@ -34,6 +51,11 @@
}

name2id = {
# Coessentiality datasets
'genes': 10273535,
'GLS_p': 10273534,
'GLS_sign': 10273533,
# DepMap datasets
'CCLE_expression': 8076862,
'CCLE_fusions': 10085763,
'CCLE_gene_cn': 8076861,
@@ -46,6 +68,7 @@
}


### Utility functions ###
def print_sys(s):
"""system print
@@ -55,80 +78,102 @@ def print_sys(s):
print(s, flush = True, file = sys.stderr)


def dataverse_download(url, path, name, types):
"""dataverse download helper with progress bar
Args:
url (str): the url of the dataset
path (str): the path to save the dataset
name (str): the dataset name
types (dict): a dictionary mapping from the dataset name to the file format
"""
save_path = os.path.join(path, f"{name}.{types[name]}")
response = requests.get(url, stream=True)
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
with open(save_path, "wb") as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()


def download_wrapper(name, path, return_type=None):
"""wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files
Args:
name (str): the rough dataset query name
path (str): the path to save the dataset
return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
### Downloading scripts ###

class Downloader:
def __init__(self):
pass

def _dataverse_download(self, url, path, name, types):
"""dataverse download helper with progress bar
Args:
url (str): the url of the dataset
path (str): the path to save the dataset
name (str): the dataset name
types (dict): a dictionary mapping from the dataset name to the file format
"""
save_path = os.path.join(path, f"{name}.{types[name]}")
response = requests.get(url, stream=True)
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
with open(save_path, "wb") as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()


def _download_wrapper(self, name, path, return_type=None):
"""wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files
Args:
name (str): the rough dataset query name
path (str): the path to save the dataset
return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
Returns:
str: the exact dataset query name
"""
server_path = "https://dataverse.harvard.edu/api/access/datafile/"

url = server_path + str(name2id[name])

if not os.path.exists(path):
os.mkdir(path)

file_name = f"{name}.{name2type[name]}"

if os.path.exists(os.path.join(path, file_name)):
print_sys("Found local copy...")
os.path.join(path, file_name)
else:
print_sys("Downloading...")
self._dataverse_download(url, path, name, name2type)

if return_type == "url":
return url
elif return_type == "name":
return file_name
elif return_type == ["url", "name"]:
return url, file_name

Returns:
str: the exact dataset query name
"""
server_path = "https://dataverse.harvard.edu/api/access/datafile/"

url = server_path + str(name2id[name])

if not os.path.exists(path):
os.mkdir(path)

file_name = f"{name}.{name2type[name]}"

if os.path.exists(os.path.join(path, file_name)):
print_sys("Found local copy...")
os.path.join(path, file_name)
else:
print_sys("Downloading...")
dataverse_download(url, path, name, name2type)

if return_type == "url":
return url
elif return_type == "name":
return file_name
elif return_type == ["url", "name"]:
return url, file_name


def depmap_dataverse_download(path, return_type=None):
"""download all datasets to the path
def run(self, path, datasets, return_type=None):
"""download all datasets to the path
Args:
path (str): the path to save the datasets
return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
"""
url_list = []
file_names = []

for name in datasets:
url, file_name = self._download_wrapper(name, path, return_type=["url", "name"])
url_list.append(url)
file_names.append(file_name)

if return_type == "url":
return url_list
elif return_type == "name":
return file_names
elif return_type == ["url", "name"]:
return url_list, file_names


class DepMapDownloader(Downloader):
def __init__(self):
super().__init__()

def download(self, path, return_type=None):
return self.run(path, depmap_dataset_names, return_type)

Args:
path (str): the path to save the datasets
return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"]
"""
url_list = []
file_names = []

for name in depmap_dataset_names:
url, file_name = download_wrapper(name, path, return_type=["url", "name"])
url_list.append(url)
file_names.append(file_name)
class CoessentialityDownloader(Downloader):
def __init__(self):
super().__init__()

if return_type == "url":
return url_list
elif return_type == "name":
return file_names
elif return_type == ["url", "name"]:
return url_list, file_names
def download(self, path, return_type=None):
return self.run(path, coessentiality_dataset_names, return_type)
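The refactor above replaces the module-level `depmap_dataverse_download` helper with a reusable `Downloader` base class and per-collection subclasses, so the coessentiality matrices hosted on the same Dataverse are fetched through the same code path. A sketch of driving the new classes (target directories are hypothetical, and must be creatable with a single `os.mkdir`):

from CanDI.setup.dataverse import DepMapDownloader, CoessentialityDownloader

# Fetch every DepMap file, returning the saved file names.
files = DepMapDownloader().download("depmap_data", return_type="name")

# Fetch the coessentiality matrices, returning both URLs and file names.
urls, names = CoessentialityDownloader().download(
    "coessentiality_data", return_type=["url", "name"]
)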
