Feat (MM): Custom CSV Importer for MM #560

Closed
wants to merge 14 commits
2 changes: 1 addition & 1 deletion lib/workload/stateless/stacks/metadata-manager/README.md
@@ -61,7 +61,7 @@ on the model of the record.

## How things work

### How Syncing The Data Works
### How Tracking Sheet Syncing Works

In the near future, we might introduce different ways to load data into the application. For the time being, we are
loading data
@@ -0,0 +1,22 @@
import logging
from django.core.management import BaseCommand
from libumccr import libjson

from handler.load_custom_metadata_csv import handler

logger = logging.getLogger()
logger.setLevel(logging.INFO)


class Command(BaseCommand):
    help = "Trigger the lambda handler to sync metadata from a CSV URL"

    def handle(self, *args, **options):
        event = {
            "url": "SOME_URL",
        }

        print(f"Triggering lambda handler to sync metadata from CSV. Event: {libjson.dumps(event)}")
        result = handler(event, {})

        print(f"result: {libjson.dumps(result)}")
@@ -81,14 +81,106 @@ class Migration(migrations.Migration):
},
),
migrations.CreateModel(
name='Subject',
name='HistoricalContact',
fields=[
('orcabus_id', models.CharField(editable=False, primary_key=True, serialize=False, unique=True, validators=[django.core.validators.RegexValidator(code='invalid_orcabus_id', message='ULID is expected to be 26 characters long', regex='[\\w]{26}$')])),
('subject_id', models.CharField(blank=True, null=True, unique=True)),
],
options={
'abstract': False,
'verbose_name': 'historical contact',
'verbose_name_plural': 'historical contacts',
'ordering': ('-history_date', '-history_id'),
'get_latest_by': ('history_date', 'history_id'),
},
bases=(simple_history.models.HistoricalChanges, models.Model),
),
migrations.CreateModel(
name='HistoricalIndividual',
fields=[
('orcabus_id', models.CharField(db_index=True, editable=False, validators=[django.core.validators.RegexValidator(code='invalid_orcabus_id', message='ULID is expected to be 26 characters long', regex='[\\w]{26}$')])),
('individual_id', models.CharField(blank=True, db_index=True, null=True)),
('source', models.CharField(blank=True, null=True)),
('history_id', models.AutoField(primary_key=True, serialize=False)),
('history_date', models.DateTimeField(db_index=True)),
('history_change_reason', models.CharField(max_length=100, null=True)),
('history_type', models.CharField(choices=[('+', 'Created'), ('~', 'Changed'), ('-', 'Deleted')], max_length=1)),
('history_user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to=settings.AUTH_USER_MODEL)),
],
options={
'verbose_name': 'historical individual',
'verbose_name_plural': 'historical individuals',
'ordering': ('-history_date', '-history_id'),
'get_latest_by': ('history_date', 'history_id'),
},
bases=(simple_history.models.HistoricalChanges, models.Model),
),
migrations.CreateModel(
name='HistoricalLibrary',
fields=[
('orcabus_id', models.CharField(db_index=True, editable=False, validators=[django.core.validators.RegexValidator(code='invalid_orcabus_id', message='ULID is expected to be 26 characters long', regex='[\\w]{26}$')])),
('library_id', models.CharField(blank=True, db_index=True, null=True)),
('phenotype', models.CharField(blank=True, choices=[('normal', 'Normal'), ('tumor', 'Tumor'), ('negative-control', 'Negative Control')], null=True)),
('workflow', models.CharField(blank=True, choices=[('clinical', 'Clinical'), ('research', 'Research'), ('qc', 'Qc'), ('control', 'Control'), ('bcl', 'Bcl'), ('manual', 'Manual')], null=True)),
('quality', models.CharField(blank=True, choices=[('very-poor', 'VeryPoor'), ('poor', 'Poor'), ('good', 'Good'), ('borderline', 'Borderline')], null=True)),
('type', models.CharField(blank=True, choices=[('10X', 'Ten X'), ('BiModal', 'Bimodal'), ('ctDNA', 'Ct Dna'), ('ctTSO', 'Ct Tso'), ('exome', 'Exome'), ('MeDIP', 'Me Dip'), ('Metagenm', 'Metagenm'), ('MethylSeq', 'Methyl Seq'), ('TSO-DNA', 'TSO_DNA'), ('TSO-RNA', 'TSO_RNA'), ('WGS', 'Wgs'), ('WTS', 'Wts'), ('other', 'Other')], null=True)),
('assay', models.CharField(blank=True, null=True)),
('coverage', models.FloatField(blank=True, null=True)),
('history_id', models.AutoField(primary_key=True, serialize=False)),
('history_date', models.DateTimeField(db_index=True)),
('history_change_reason', models.CharField(max_length=100, null=True)),
('history_type', models.CharField(choices=[('+', 'Created'), ('~', 'Changed'), ('-', 'Deleted')], max_length=1)),
('history_user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to=settings.AUTH_USER_MODEL)),
('sample', models.ForeignKey(blank=True, db_constraint=False, null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='app.sample')),
],
options={
'verbose_name': 'historical library',
'verbose_name_plural': 'historical librarys',
'ordering': ('-history_date', '-history_id'),
'get_latest_by': ('history_date', 'history_id'),
},
bases=(simple_history.models.HistoricalChanges, models.Model),
),
migrations.CreateModel(
name='HistoricalProject',
fields=[
('orcabus_id', models.CharField(db_index=True, editable=False, validators=[django.core.validators.RegexValidator(code='invalid_orcabus_id', message='ULID is expected to be 26 characters long', regex='[\\w]{26}$')])),
('project_id', models.CharField(blank=True, db_index=True, null=True)),
('name', models.CharField(blank=True, null=True)),
('description', models.CharField(blank=True, null=True)),
('history_id', models.AutoField(primary_key=True, serialize=False)),
('history_date', models.DateTimeField(db_index=True)),
('history_change_reason', models.CharField(max_length=100, null=True)),
('history_type', models.CharField(choices=[('+', 'Created'), ('~', 'Changed'), ('-', 'Deleted')], max_length=1)),
('history_user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to=settings.AUTH_USER_MODEL)),
],
options={
'verbose_name': 'historical project',
'verbose_name_plural': 'historical projects',
'ordering': ('-history_date', '-history_id'),
'get_latest_by': ('history_date', 'history_id'),
},
bases=(simple_history.models.HistoricalChanges, models.Model),
),
migrations.CreateModel(
name='HistoricalSample',
fields=[
('orcabus_id', models.CharField(db_index=True, editable=False, validators=[django.core.validators.RegexValidator(code='invalid_orcabus_id', message='ULID is expected to be 26 characters long', regex='[\\w]{26}$')])),
('sample_id', models.CharField(blank=True, db_index=True, null=True)),
('external_sample_id', models.CharField(blank=True, null=True)),
('source', models.CharField(blank=True, choices=[('ascites', 'Ascites'), ('blood', 'Blood'), ('bone-marrow', 'BoneMarrow'), ('buccal', 'Buccal'), ('cell-line', 'Cell_line'), ('cfDNA', 'Cfdna'), ('cyst-fluid', 'Cyst Fluid'), ('DNA', 'Dna'), ('eyebrow-hair', 'Eyebrow Hair'), ('FFPE', 'Ffpe'), ('FNA', 'Fna'), ('OCT', 'Oct'), ('organoid', 'Organoid'), ('PDX-tissue', 'Pdx Tissue'), ('plasma-serum', 'Plasma Serum'), ('RNA', 'Rna'), ('tissue', 'Tissue'), ('skin', 'Skin'), ('water', 'Water')], null=True)),
('history_id', models.AutoField(primary_key=True, serialize=False)),
('history_date', models.DateTimeField(db_index=True)),
('history_change_reason', models.CharField(max_length=100, null=True)),
('history_type', models.CharField(choices=[('+', 'Created'), ('~', 'Changed'), ('-', 'Deleted')], max_length=1)),
('history_user', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='+', to=settings.AUTH_USER_MODEL)),
],
options={
'verbose_name': 'historical sample',
'verbose_name_plural': 'historical samples',
'ordering': ('-history_date', '-history_id'),
'get_latest_by': ('history_date', 'history_id'),
},
bases=(simple_history.models.HistoricalChanges, models.Model),
),
migrations.CreateModel(
name='HistoricalContact',
@@ -228,6 +320,31 @@ class Migration(migrations.Migration):
('project', models.ForeignKey(db_column='project_orcabus_id', on_delete=django.db.models.deletion.CASCADE, to='app.project')),
],
),
migrations.CreateModel(
name='HistoricalLibrary_project_set',
fields=[
('id', models.BigIntegerField(auto_created=True, blank=True, db_index=True, verbose_name='ID')),
('m2m_history_id', models.AutoField(primary_key=True, serialize=False)),
('history', models.ForeignKey(db_constraint=False, on_delete=django.db.models.deletion.DO_NOTHING, to='app.historicallibrary')),
('library', models.ForeignKey(blank=True, db_constraint=False, db_tablespace='', null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='app.library')),
('project', models.ForeignKey(blank=True, db_constraint=False, db_tablespace='', null=True, on_delete=django.db.models.deletion.DO_NOTHING, related_name='+', to='app.project')),
],
options={
'verbose_name': 'HistoricalLibrary_project_set',
},
bases=(simple_history.models.HistoricalChanges, models.Model),
),
migrations.CreateModel(
name='Subject',
fields=[
('orcabus_id', models.CharField(editable=False, primary_key=True, serialize=False, unique=True, validators=[django.core.validators.RegexValidator(code='invalid_orcabus_id', message='ULID is expected to be 26 characters long', regex='[\\w]{26}$')])),
('subject_id', models.CharField(blank=True, null=True, unique=True)),
('individual_set', models.ManyToManyField(blank=True, related_name='subject_set', to='app.individual')),
],
options={
'abstract': False,
},
),
migrations.AddField(
model_name='library',
name='project_set',
32 changes: 32 additions & 0 deletions lib/workload/stateless/stacks/metadata-manager/deploy/README.md
@@ -58,3 +58,35 @@ aws lambda invoke \
--cli-binary-format raw-in-base64-out \
res.json
```

### CustomCsvLambda

- Loads tracking sheet data from a CSV presigned URL

To trigger the sync manually, use the lambda ARN stored in the SSM Parameter Store under the name
`/orcabus/metadata-manager/load-custom-csv-lambda-arn`.

To look up the ARN from a local terminal:

```sh
load_custom_csv_lambda_arn=$(aws ssm get-parameter --name '/orcabus/metadata-manager/load-custom-csv-lambda-arn' --with-decryption | jq -r .Parameter.Value)
```

The lambda handler accepts a JSON payload with a single key, `url`, which is the presigned URL of the CSV file.

```json
{
"url": "https://example.com/csv"
}
```

To invoke the lambda:

```sh
aws lambda invoke \
  --function-name "$load_custom_csv_lambda_arn" \
  --invocation-type Event \
  --payload '{ "url": "https://the.url.csv" }' \
  --cli-binary-format raw-in-base64-out \
  res.json
```
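
For programmatic use, the two steps above (resolving the ARN from SSM and invoking the lambda) can be combined with boto3. The snippet below is a minimal sketch rather than part of this stack; it assumes the default AWS credential chain and the parameter name shown above.

```python
# A minimal sketch (not part of this PR) combining the two CLI steps above:
# resolve the lambda ARN from SSM, then invoke the lambda asynchronously
# with a presigned CSV URL. Assumes the default AWS credential chain.
import json

import boto3

PARAMETER_NAME = '/orcabus/metadata-manager/load-custom-csv-lambda-arn'


def trigger_load(presigned_csv_url: str) -> int:
    """Invoke the load-custom-csv lambda with the given presigned URL."""
    ssm = boto3.client('ssm')
    lambda_client = boto3.client('lambda')

    # The CDK construct stores the lambda ARN under this parameter name
    lambda_arn = ssm.get_parameter(Name=PARAMETER_NAME)['Parameter']['Value']

    # Fire-and-forget invocation, equivalent to `--invocation-type Event` above
    response = lambda_client.invoke(
        FunctionName=lambda_arn,
        InvocationType='Event',
        Payload=json.dumps({'url': presigned_csv_url}).encode('utf-8'),
    )
    return response['StatusCode']  # 202 is expected for an async invoke


if __name__ == '__main__':
    print(trigger_load('https://the.url.csv'))
```

An `Event` invocation returns immediately with HTTP 202; the persist report is written to the lambda's CloudWatch logs.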
@@ -0,0 +1,54 @@
import path from 'path';
import { Construct } from 'constructs';
import { Duration } from 'aws-cdk-lib';
import { ISecret } from 'aws-cdk-lib/aws-secretsmanager';
import { StringParameter } from 'aws-cdk-lib/aws-ssm';
import {
  DockerImageFunction,
  DockerImageFunctionProps,
  DockerImageCode,
} from 'aws-cdk-lib/aws-lambda';

type LambdaProps = {
  /**
   * The basic common lambda properties to inherit from
   */
  basicLambdaConfig: Partial<DockerImageFunctionProps>;
  /**
   * The secret for the DB connection that the lambda needs access to
   */
  dbConnectionSecret: ISecret;
};

export class LambdaLoadCustomCSVConstruct extends Construct {
  private readonly lambda: DockerImageFunction;

  constructor(scope: Construct, id: string, lambdaProps: LambdaProps) {
    super(scope, id);

    this.lambda = new DockerImageFunction(this, 'LoadCustomCSVLambda', {
      environment: {
        ...lambdaProps.basicLambdaConfig.environment,
      },
      securityGroups: lambdaProps.basicLambdaConfig.securityGroups,
      vpc: lambdaProps.basicLambdaConfig.vpc,
      vpcSubnets: lambdaProps.basicLambdaConfig.vpcSubnets,
      architecture: lambdaProps.basicLambdaConfig.architecture,
      code: DockerImageCode.fromImageAsset(path.join(__dirname, '../../../'), {
        file: 'deploy/construct/lambda-load-custom-csv/lambda.Dockerfile',
      }),
      timeout: Duration.minutes(15),
      memorySize: 4096,
    });

    lambdaProps.dbConnectionSecret.grantRead(this.lambda);

    // Store the lambda ARN in SSM Parameter Store so it can be looked up when the load needs to be triggered manually
    new StringParameter(this, 'LoadCustomCSVLambdaArnParameterStore', {
      parameterName: '/orcabus/metadata-manager/load-custom-csv-lambda-arn',
      description: 'The ARN of the lambda that loads metadata from a presigned CSV URL',
      stringValue: this.lambda.functionArn,
    });
  }
}
@@ -0,0 +1,12 @@
FROM public.ecr.aws/lambda/python:3.12

WORKDIR ${LAMBDA_TASK_ROOT}

# COPY all files
COPY . .

# Install the specified packages
RUN pip install -r deps/requirements-full.txt

# Specify handler
CMD [ "handler.load_custom_metadata_csv.handler" ]
@@ -0,0 +1,35 @@
import django
import os
import logging

from libumccr import libjson

from proc.service.utils import sanitize_lab_metadata_df, warn_drop_duplicated_library

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings.base')
django.setup()

from proc.service.load_csv_srv import load_metadata_csv, download_csv_to_pandas

logger = logging.getLogger()
logger.setLevel(logging.INFO)


def handler(event, _context):
    logger.info(f'event: {libjson.dumps(event)}')

    csv_url = event.get('url', None)
    if csv_url is None:
        raise ValueError("URL is required")

    csv_df = download_csv_to_pandas(csv_url)
    sanitize_df = sanitize_lab_metadata_df(csv_df)
    duplicate_clean_df = warn_drop_duplicated_library(sanitize_df)
    result = load_metadata_csv(duplicate_clean_df)

    logger.info(f'persist report: {libjson.dumps(result)}')
    return result


if __name__ == '__main__':
    handler({}, {})
@@ -7,8 +7,8 @@
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings.base')
django.setup()

from proc.service.tracking_sheet_srv import download_tracking_sheet, sanitize_lab_metadata_df, persist_lab_metadata, \
warn_drop_duplicated_library
from proc.service.tracking_sheet_srv import download_tracking_sheet, persist_lab_metadata
from proc.service.utils import sanitize_lab_metadata_df, warn_drop_duplicated_library

logger = logging.getLogger()
logger.setLevel(logging.INFO)