Source code for pulsarpy_to_encodedcc.dcc_submit

# -*- coding: utf-8 -*-

###
# © 2018 The Board of Trustees of the Leland Stanford Junior University
# Nathaniel Watson
# nathankw@stanford.edu
###

"""
Required environment variables
  1) Those that are required in the pulsarpy.models module for connecting to Pulsar LIMS:
     -PULSAR_API_URL
     -PULSAR_TOKEN
  2) Those that are required in the encode_utils.connection module to connect to the ENCODE Portal:
     -DCC_API_KEY
     -DCC_SECRET_KEY

Optional environment variables:
  1) DCC_MODE_ - Specifies which ENCODE Portal host to connect to. If not set, then must be provided
     when instantiating the Submit() class.

..  _DCC_MODE: https://encode-utils.readthedocs.io/en/latest/connection.html#encode_utils.connection.Connection.dcc_mode

"""

import base64
import logging
import os
import re
import requests
import sys

import dxpy

import pulsarpy_to_encodedcc
from pulsarpy_to_encodedcc import FASTQ_FOLDER, log_error
from pulsarpy import models
import pulsarpy.utils
import encode_utils as eu
import encode_utils.aws_storage
import encode_utils.replicate
import encode_utils.connection as euc
import encode_utils.utils as euu
import pdb

error_logger = logging.getLogger(pulsarpy_to_encodedcc.ERROR_LOGGER_NAME)
# regex for finding one or more continuous spaces
space_reg = re.compile(r' +')


[docs]class IpLaneException(Exception): """ Raised when posting an IP. The is a temporary class that'll be removed once the exceptions are handled property in the Submit.post_ip_lane method. """
[docs]class ExpMissingReplicates(Exception): """ Raised when trying to POST an experiment to the Portal (such as a control experiment) and there aren't any replicates (Biosample records) to attach to it. """
[docs]class MissingSequencingResult(Exception): pass
[docs]class MissingTargetUpstream(Exception): """ Raised when submitting a record that tries to link to a DCC target, but the target record in Pulsar doesn't have the upstream_identifier attribute set. """
[docs]class UpstreamNotSet(Exception): pass
[docs]class NoFastqFile(Exception): """ Raised in Submit.post_fastq_file() when submitting either a R1 FASTQ file or a R2 FASTQ file, and the filepath isn't set in the corresponding SequencingResult record in Pulsar. """ pass
#def dec(model_class): # """ # A decorator that is to be used with the post_* methods defined in the Submit class defined below. # """ # # def wrapper(func): # # def inner(self, rec_id, patch=False, *args, **kwargs): # """ # Saves time by checking whether a record needs to be posted to the Portal before it's # payload is constructed. It need not be posted if in Pulsar it has the upstream_identifier # attribute set AND the 'patch' argument is False. The decorated method will thus only run # if upstream_identifier isn't set, or if the 'patch' argument is True. # """ # rec = model_class(rec_id) # upstream = rec.get_upstream() # if upstream and not patch: # # Then no need to post # return upstream # else: # self.func(rec_id=rec_id, patch=patch, *args, **kwargs) # return inner # # return wrapper
[docs]class Submit(): """ Contains methods for submitting various types of objects in Pulsar to the ENCODE Portal. """ def __init__(self, dcc_mode=None, extend_arrays=True): if not dcc_mode: try: dcc_mode = os.environ["DCC_MODE"] print("Utilizing DCC_MODE environment variable.") except KeyError: print("ERROR: You must supply the `dcc_mode` argument or set the environment variable DCC_MODE.") sys.exit(-1) self.dcc_mode = dcc_mode self.ENC_CONN = euc.Connection(self.dcc_mode, submission=True) #: When patching, there is the option to extend array properties or overwrite their values. #: The default is to extend. self.extend_arrays = extend_arrays def filter_standard_attrs(self, payload): attrs = ["created_at", "id", "owner_id", "updated_at", "user_id"] for i in attrs: if i in payload: payload.pop(i) for i in payload: if i.startswith("_"): payload.pop(i) return payload
[docs] def sanitize_prop_val(self, txt): """ Replaces characters that can be problematic in property values on the ENCODE Portal. For example, the '/' character in an alias is a problem since the alias is an identifying property that can be used in a URL to view the record. In this case, the '/' will be interpreted as a path separator. Characters that get replaced: currently, just '/' with '-'. Args: txt: `str`. The value to clean. Returns: `str`. The cleaned value that is submission acceptable. """ txt = txt.replace("/","-").strip() # Replace contiguous spaces with a single space return space_reg.sub(" ", txt)
[docs] def get_vendor_id_from_encodeportal(self, pulsar_vendor_id): """ Given a Pulsar Vendor record ID, returns the upstream identifier. Raises: `UpstreamNotSet`: The Pulsar vendor.upstream_identifier attribute isn't set. """ if not pulsar_vendor_id: return "" vendor = models.Vendor(pulsar_vendor_id) upstream = vendor.upstream_identifier if not upstream: msg = "Vendor {} with Pulsar ID {} does not have the upstream_identifier attribute set.".format(vendor.name, vendor.id) raise UpstreamNotSet(msg) return upstream
[docs] def patch(self, payload, upstream_id, dont_extend_arrays=False): """ A wrapper over `encode_utils.connection.Connection.patch()`. Args: dont_extend_arrays: `bool`. Dynamic way to signal not to extend array property values. If not True, then the boolean value of `self.extend_arrays` determines whether arrays are extended. Returns: `dict`: The JSON response from the PATCH operation, or an empty dict if the record doesn't exist on the Portal. See ``encode_utils.connection.Connection.patch()`` for more details. """ payload[self.ENC_CONN.ENCID_KEY] = upstream_id if dont_extend_arrays: extend = False else: extend = self.extend_arrays response_json = self.ENC_CONN.patch(payload=payload, extend_array_values=extend) if not response_json: raise Exception("Couldn't PATCH record on the Portal since it doesn't exist.") return upstream_id
[docs] def post(self, payload, dcc_profile, pulsar_model, pulsar_rec_id): """ A wrapper over `encode_utils.connection.Connection.post()`. First checks if the Pulsar record has an upstream_identifier set, and if set, returns it rather than attempting to re-post. Adds aliases to the payload being the record's record ID and name. Sets the profile key in the payload. If the record is successfully posted to the prod ENCODE Portal, then sets the upstream_identifier attribute in the Pulsar record. Args: payload: `dict`. The new record attributes to submit. dcc_profile: `str`. The name of the ENCODE Profile for this record, i.e. 'biosample', 'genetic_modification'. pulsar_model: One of the defined subclasses of the ``models.Model`` class, i.e. ``models.Model.Biosample``, which will be used to set the Pulsar record's upstream_identifier attribute after a successful POST to the ENCODE Portal. pulsar_rec_id: `str`. The identifier of the Pulsar record to POST to the DCC. Returns: `str`: The upstream identifier for the new record on the ENCODE Portal, or the existing upstream identifier if the record already exists; see ``encode_utils.utils.get_record_id()`` for more details. """ payload[self.ENC_CONN.PROFILE_KEY] = dcc_profile pulsar_rec = pulsar_model(pulsar_rec_id) upstream = pulsar_rec.upstream_identifier if upstream: # May sure that upstream exists. Could be that the upstream identifier belongs to a different # server, i.e. test or a demo, than the one we are currently connected to. # No need to POST. if upstream.startswith("ENC"): # For sure this is a production accession and we should leave it alone. return upstream exists_on_server = self.ENC_CONN.get(rec_ids=upstream) if exists_on_server: return upstream aliases = payload.get("aliases", []) abbrev_alias = pulsar_rec.abbrev_id() if abbrev_alias not in aliases: aliases.append(abbrev_alias) # Add value of 'name' property as an alias, if this property exists for the given model. try: name = self.sanitize_prop_val(pulsar_rec.name) if name: # Need to prepend the model abbreviation to the name since some names are the same # between models. For example, its common in Pulsar to have a Library named the # same as the Biosample it belongs to. alias_name = models.Model.PULSAR_LIMS_PREFIX + pulsar_rec.MODEL_ABBR + "-" + name if alias_name not in aliases: aliases.append(alias_name) except KeyError: pass payload["aliases"] = aliases # `dict`. The POST response if the record didn't yet exist on the ENCODE Portal, or the # record JSON itself if it does already exist. Note that the dict. will be empty if the connection # object to the ENCODE Portal has the dry-run feature turned on. response_json = self.ENC_CONN.post(payload) upstream = euu.get_record_id(response_json) # Set value of the Pulsar record's upstream_identifier print("Setting the Pulsar record's upstream_identifier attribute to '{}'.".format(upstream)) pulsar_rec.patch(payload={"upstream_identifier": upstream}) print("upstream_identifier attribute set successfully.") return upstream
[docs] def get_biosample_term_name_and_type(self, biosample): """ Creates a dict. with the keys: biosample_term_name biosample_term_id biosample_type Args: biosample: `pulsarpy.models.Biosample` instance. Returns: `dict`. """ res = {} btn = models.BiosampleTermName(biosample.biosample_term_name_id) res["biosample_term_name"] = btn.name res["biosample_term_id"] = btn.accession bty = models.BiosampleType(biosample.biosample_type_id) res["biosample_type"] = bty.name return res
def get_exp_of_biosample(self, dcc_biosample_id): query_string = "?searchTerm={}&type=Experiment".format(dcc_biosample_id) experiments = self.ENC_CONN.search(url=query_string) if len(experiments) > 1: raise Exception("Expected to find 1 experiment linked to biosample {}, instead found more: {}.".format(dcc_biosample_id, len(experiments))) return experiments[0]["accession"]
[docs] def post_library_through_fastq(self, pulsar_library_id, dcc_exp_id, patch=False): """ POSTS the Biosample, it's latest Library, and all SequencingResults for that Library. Args: pulsar_library_id: `int`. The ID of a Pulsar Library record. dcc_exp_id: `int`. The ID of the experiment record on the Portal to link the replicate to. """ # POST biosample record pulsar_library = models.Library(pulsar_library_id) pulsar_biosample_id = pulsar_library.biosample_id biosample_upstream = self.post_biosample(pulsar_biosample_id, patch=patch) biosample = models.Biosample(pulsar_biosample_id) #if not dcc_exp_id: # dcc_exp_id = self.get_exp_of_biosample(biosample_upstream) # POST library record library_upstream = self.post_library(rec_id=pulsar_library.id, patch=patch) # POST replicate record replicate_upstream = self.post_replicate(pulsar_library_id=pulsar_library.id, dcc_exp_id=dcc_exp_id, patch=patch) # POST file records for all sequencing results for the Library sres_ids = pulsar_library.sequencing_result_ids if not sres_ids: msg = "No SequencingResult for Library {} of Biosample {}, exiting.".format(pulsar_biosample_id, pulsar_library.id) error_logger.error(msg) raise MissingSequencingResult(msg) for i in sres_ids: self.post_sres(pulsar_sres_id=i, enc_replicate_id=replicate_upstream, patch=patch)
[docs] def post_sres(self, pulsar_sres_id, enc_replicate_id, patch=False): """ A wrapper over ``self.post_fastq_file()``. Whereas ``self.post_fastq_file()`` only uploads the FASTQ file for the given read number, this method calls ``self.post_fastq_file()`` twice potentially, once for each FASTQ file in the Pulsar SequencingResult. Thus, if paired-end sequencing was done, ``self.post_fastq_file()`` will be called twice to upload the forward and reverse reads FASTQ files. """ sres = models.SequencingResult(pulsar_sres_id) srun = models.SequencingRun(sres.sequencing_run_id) sreq = models.SequencingRequest(srun.sequencing_request_id) self.post_fastq_file(pulsar_sres_id=sres.id, read_num=1, enc_replicate_id=enc_replicate_id, patch=patch) if not sreq.paired_end and sres.read2_uri: sres.patch({"paired_end": True}) if sreq.paired_end: # Submit read2 self.post_fastq_file(pulsar_sres_id=sres.id, read_num=2, enc_replicate_id=enc_replicate_id, patch=patch)
[docs] def check_if_biosample_has_exp_on_portal(self, dcc_biosample_id): """ Given a Portal biosample record ID, searches the Portal for associated experiment records. Any that are found are returned in a list. Args: dcc_biosample_id: `str`. A biosample record identifier on the Portal. Returns: `list` of associated experiment records, where each is JSON-serialized. Raises: `Exception`: The biosample is linked to more than one experiment. """ if not dcc_biosample_id: return False exps = self.ENC_CONN.get_experiments_with_biosample(rec_id=dcc_biosample_id) if exps: # Should only exist on one experiment. exps is an array of >= 0 experiment records. if len(exps) > 1: accessions = [] for i in exps: accessions.append(i["accession"]) msg = "Error: Biosample {} is associated to more than one Portal experiment: {}.".format(dcc_biosample_id, ", ".join(accessions)) raise Exception(msg) return exps[0] return False
[docs] def post_chipseq_ctl_exp(self, rec_id, wt_input=False, paired_input=False, exp_only=False, patch=False): """ Creates a control experiment record on the ENCODE Portal for either the paired-input control biosample(s) or the wild-type input biosample on the Pulsar ChipseqExperiment. Args: rec_id: `int`. ID of a ChipseqExperiment record in Pulsar. wt_input: `bool`. True means to make a control experiment on the Portal for the wild-type input biosample on the Pulsar ChipseqExperiment. Note that either this or the `paired_input` parameter must be set to True and not both. paired_input: `bool`. True means to make a control experiment on the Portal for the paired-input control biosample(s) on the Pulsar ChipseqExperiment. Note that either this or the `wild_type` parameter must be set to True and not both. exp_only: `bool`. Only makes sense to use when the `patch` parameter is set to True. When `exp_only=True`, then don't PATCH Biosample records and everything downstream to the file records on the Portal (don't call `self.post_library_through_fastq()`). Returns: `str`: The ENCODE Portal accession of the control experiment. Raises: `ValueError`: Both parameters `wt_input` and `paired_input` are set to False or True. Only one of them must be True. """ print(">>> IN post_chipseq_ctl_exp()") if (not wt_input and not paired_input) or (wt_input and paired_input): raise ValueError("Either the wt_input or the paired_input parameter must be set to True.") pulsar_exp = models.ChipseqExperiment(rec_id) input_ids = [] if wt_input: experiment_type = "wild type" # Only 1 Wild Type input per experiment. if pulsar_exp.wild_type_control_id: input_ids.append(pulsar_exp.wild_type_control_id) else: experiment_type = "paired-input" # Normally there will only be one paired_input control Biosample, but there could at times # be another. That happens when one of the reps fail, and another rep has to be made from a # different cell batch than the sibling rep on the experiment. input_ids.extend(pulsar_exp.control_replicate_ids) # Biosample records. if not input_ids: msg = "Can't submit {} control exp. for {}: no replicates.".format(experiment_type, pulsar_exp.abbrev_id()) log_error(msg) raise ExpMissingReplicates(msg) inputs = [models.Biosample(x) for x in input_ids] dcc_exp = "" for i in inputs: dcc_exp = self.check_if_biosample_has_exp_on_portal(i.upstream_identifier) if dcc_exp: break payload = {} alias = "" for i in inputs: alias += i.abbrev_id() alias.rstrip() if wt_input: alias_prefix = "pWT-CTL_" else: alias_prefix = "pPI-CTL_" alias = alias_prefix + alias payload["aliases"] = [alias] payload.update(self.get_exp_core_payload_props(pulsar_exp_rec=pulsar_exp, assay_term_name="ChIP-seq")) payload["description"] = "ChIP-seq on human " + payload["biosample_term_name"] payload["target"] = "Control-human" # Before POSTING experiment, check if it already exists on the Portal. # POST experiment. Don't use self.post() since there isn't a Pulsar model for a control experiment. # So, use encode-utils directly to POST. if patch: if not dcc_exp: msg = "Can't PATCH " + alias + " control experiment since it wasn't found on the Portal." raise Exception(msg) ctl_exp_accession = self.patch(payload=payload, upstream_id=dcc_exp["accession"]) else: # post if not dcc_exp: payload[self.ENC_CONN.PROFILE_KEY] = "experiment" dcc_exp = self.ENC_CONN.post(payload=payload) ctl_exp_accession = dcc_exp["accession"] if (patch and not exp_only) or not patch: for b in input_ids: self.post_library_through_fastq(pulsar_biosample_id=b, dcc_exp_id=ctl_exp_accession, patch=patch) return ctl_exp_accession
[docs] def post_bulk_atacseq_exp(self, rec_id, patch=False, patch_all=False): """ Args: rec_id: `int`. ID of an AtacSeq experiment record in Pulsar. Should be a bulk and not a single-cell experiment. patch: `bool`. True means to patch the DCC experiment record. patch_all: `bool`. True means to patch not just the experiment record, but its sub-entities also, i.e. biosamples, libraries, replicates, ... Setting this to True automatically sets `patch` to True as well. Returns: """ if patch_all: patch = True pulsar_exp = models.Atacseq(rec_id) pulsar_exp_upstream = pulsar_exp.upstream_identifier payload = {} payload.update(self.get_exp_core_payload_props(pulsar_exp_rec=pulsar_exp, assay_term_name="ATAC-seq")) desc = pulsar_exp.description.strip() if desc: payload["description"] = desc # submit experiment if patch: dcc_exp_accession = self.patch(payload=payload, upstream_id=pulsar_exp_upstream) if patch_all or not patch: dcc_exp_accession = self.post(payload=payload, dcc_profile="experiment", pulsar_model=models.Atacseq, pulsar_rec_id=rec_id) self.post_experimental_reps(rec_id=rec_id, experiment_type="atac-seq", patch=patch) return dcc_exp_accession
[docs] def post_chipseq_exp(self, rec_id, patch=False): """ Args: rec_id: `int`. ID of a ChipseqExperiment record in Pulsar. Returns: `str`: The ENCODE Portal accession of the control experiment. Raises: `ValueError`: Both parameters `wt_input` and `paired_input` are set to False or True. Only one of them must be True. """ pulsar_exp = models.ChipseqExperiment(rec_id) pulsar_exp_upstream = pulsar_exp.upstream_identifier payload = {} payload.update(self.get_exp_core_payload_props(pulsar_exp_rec=pulsar_exp, assay_term_name="ChIP-seq")) target = models.Target(pulsar_exp.target_id) target_upstream = target.upstream_identifier if not target_upstream: msg = "Target {} missing upstream identifier.".format(target.abbrev_id()) log_error(msg) raise MissingTargetUpstream(msg) payload["target"] = target.upstream_identifier #payload["description"] = pulsar_exp.description.strip() payload["description"] = target.upstream_identifier.rstrip('-human') + ' ChIP-seq on human ' + payload["biosample_term_name"] # submit experiment if patch: dcc_exp_accession = self.patch(payload=payload, upstream_id=pulsar_exp_upstream) else: dcc_exp_accession = self.post(payload=payload, dcc_profile="experiment", pulsar_model=models.ChipseqExperiment, pulsar_rec_id=rec_id) # Then POST WT-input and paired-input control experiments. The WT-input is shared across # multiple experiments from the same starting batch, so it's possible that it was POSTED # already during submission of a related experiment. self.post_chipseq_control_experiments(rec_id=rec_id) # POST experimental biosampes self.post_experimental_reps(rec_id=rec_id, experiment_type="chip-seq") # Add-in/PATCH possible_controls property self.patch_chipseq_possible_controls(pulsar_exp.id) return dcc_exp_accession
def patch_chipseq_possible_controls(self, pulsar_exp_id): possible_controls = self.get_chipseq_possible_controls(pulsar_exp_id) payload = {} payload["possible_controls"] = possible_controls exp = models.ChipseqExperiment(pulsar_exp_id) self.patch(payload=payload, upstream_id=exp.upstream_identifier, dont_extend_arrays=True) def get_chipseq_possible_controls(self, pulsar_exp_id): possible_controls = [] exp = models.ChipseqExperiment(pulsar_exp_id) wt = models.Biosample(exp.wild_type_control_id) wt_upstream = wt.upstream_identifier wt_ctl_exp = self.check_if_biosample_has_exp_on_portal(wt_upstream) if not wt_ctl_exp: raise Exception("WT input {} on ChipseqExperiment {} doesn't have an upstream control experiment record.".format(wt.abbrev_id(), exp.abbrev_id())) possible_controls.append(wt_ctl_exp["accession"]) pis = [models.Biosample(x) for x in exp.control_replicate_ids] for i in pis: upstream = i.upstream_identifier pi_ctl_exp = self.check_if_biosample_has_exp_on_portal(upstream) if not pi_ctl_exp: raise Exception("Paired input {} on ChipseqExperiment {} doesn't have an upstream control experiment record.".format(i.abbrev_id(), exp.abbrev_id())) possible_controls.append(pi_ctl_exp["accession"]) return list(set(possible_controls))
[docs] def post_chipseq_control_experiments(self, rec_id): """ POSTS the WT input and the paired input controls that are associated to the indicated ChipseqExperiment in Pulsar, turning each into an experiment record on the Portal. Args: rec_id: `int`. ID of a ChipseqExperiment record in Pulsar. """ print(">>> IN post_chipseq_control_experiments()") # First the WT-input: self.post_chipseq_ctl_exp(rec_id=rec_id, wt_input=True) # Then the Paired-input, which is unique to this experiment. self.post_chipseq_ctl_exp(rec_id=rec_id, paired_input=True)
[docs] def post_experimental_reps(self, rec_id, experiment_type, patch=False): """ POSTS the experimental replicates of a ChipseqExperiment or bulk Atacseq experiment object. Args: rec_id: `int`. ID of a ChipseqExperiment record in Pulsar. experiment_type: `str`. Either of chip-seq or atac-seq. """ if experiment_type == "chip-seq": pulsar_exp = models.ChipseqExperiment(rec_id) elif experiment_type == "atac-seq": pulsar_exp = models.Atacseq(rec_id) else: raise Exception("Unknown experiment type '{}' passed to experiment_type parameter.".format(experiment_type)) rep_ids = pulsar_exp.replicate_ids for i in rep_ids: self.post_library_through_fastq(pulsar_library_id=i, dcc_exp_id=pulsar_exp.upstream_identifier, patch=patch)
[docs] def get_exp_core_payload_props(self, pulsar_exp_rec, assay_term_name): """ Args: pulsar_exp_rec: `str`. `pulsarpy.models` subclass being either ChipSeq or Atacseq. assay_term_name: `str`. Either 'ChIP-seq' or ATAC-seq. """ payload = {} first_rep_library = models.Library(pulsar_exp_rec.replicate_ids[0]) first_rep_biosample = models.Biosample(first_rep_library.biosample_id) # Add biosample_term_name, biosample_term_id, and biosample_type props btn = models.BiosampleTermName(first_rep_biosample.biosample_term_name_id) bty = models.BiosampleType(first_rep_biosample.biosample_type_id) payload["biosample_ontology"] = self.ENC_CONN.get_biosample_type(classification=bty.name, term_id=btn.accession)["@id"] payload["assay_term_name"] = assay_term_name payload["documents"] = self.post_documents(pulsar_exp_rec.document_ids) payload["experiment_classification"] = ["functional genomics assay"] submitter_comment = pulsar_exp_rec.submitter_comments.strip() if submitter_comment: payload["submitter_comment"] = submitter_comment return payload
def post_crispr_modification(self, rec_id, patch=False): rec = models.CrisprModification(rec_id) # CrisprConstruct(s) cc_ids = rec.crispr_construct_ids ccs = [models.CrisprConstruct(i) for i in cc_ids] # DonorConstruct dc = models.DonorConstruct(rec.donor_construct_id) dc_target = models.Target(dc.target_id) target_upstream = dc_target.upstream_identifier if not target_upstream: msg = "Target {} missing upstream identifier.".format(dc_target.abbrev_id()) log_error(msg) raise MissingTargetUpstream(msg) payload = {} payload["category"] = rec.category # Required desc = rec.description.strip() if desc: payload["description"] payload["documents"] = self.post_documents(rec.document_ids) guide_seqs = list(c.guide_sequence for c in ccs) payload["guide_rna_sequences"] = guide_seqs if rec.category in ["insertion", "replacement"]: pass # The insert can be viewed in addgene. This doesn't look good to show on the Portal. #payload["introduced_sequence"] = dc.insert_sequence.upper() payload["method"] = "CRISPR" # Required payload["modified_site_by_target_id"] = dc_target.upstream_identifier payload["purpose"] = rec.purpose # Required # Note that CrisprConstruct can also has_many construct_tags. Those are not part of the donor # insert though. construct_tags = [models.ConstructTag(i) for i in dc.construct_tag_ids] construct_tag_names = [x.name for x in construct_tags] seen_tags = [] introduced_tags = [] for tag in construct_tag_names: if tag.startswith("eGFP"): # Pulsar has two eGFP tags that differ by the linker sequence: # 1) eGFP (MH170480) # 2) eGFP (MH170481) # The Portal, however, only has eGFP and it makes most sense to submit this as # simply eGFP and mention the linker used elsewhere. tag = "eGFP" if tag not in seen_tags: # Avoid potential for duplicate tags, which are not allowed of course on Portal. seen_tags.append(tag) introduced_tags.append({"name": tag, "location": "C-terminal"}) if not introduced_tags: # tags are required for modifications on the Portal. introduced_tags = [{"name": "eGFP", "location": "C-terminal"}] payload["introduced_tags"] = introduced_tags reagents = [] for i in ccs + [dc]: addgene_id = getattr(i, "addgene_id") if addgene_id: r = {} r["source"] = "addgene" r["url"] = "http://www.addgene.org/" + addgene_id r["identifier"] = addgene_id reagents.append(r) if reagents: payload["reagents"] = reagents # ex: ENCGM094ZOS if patch: upstream_id = self.patch(payload=payload, upstream_id=rec.upstream_identifier) else: upstream_id = self.post(payload=payload, dcc_profile="genetic_modification", pulsar_model=models.CrisprModification, pulsar_rec_id=rec_id) return upstream_id def post_document(self, rec_id, patch=False): rec = models.Document(rec_id) payload = {} desc = rec.description.strip() if desc: payload["description"] = rec.description doc_type = models.DocumentType(rec.document_type_id) payload["document_type"] = doc_type.name content_type = rec.content_type # Create attachment for the attachment prop file_contents = rec.download() data = base64.b64encode(file_contents) temp_uri = str(data, "utf-8") href = "data:{mime_type};base64,{temp_uri}".format(mime_type=content_type, temp_uri=temp_uri) attachment = {} attachment["download"] = rec.name attachment["type"] = content_type attachment["href"] = href payload["attachment"] = attachment if patch: upstream_id = self.patch(payload, rec.upstream_identifier) else: upstream_id = self.post(payload=payload, dcc_profile="document", pulsar_model=models.Document, pulsar_rec_id=rec_id) return upstream_id def post_documents(self, rec_ids, patch=False): upstreams = [] for i in rec_ids: upstreams.append(self.post_document(rec_id=i, patch=patch)) return upstreams def post_treatments(self, rec_ids, patch=False): upstreams = [] for i in rec_ids: upstreams.append(self.post_treatment(rec_id=i, patch=patch)) return upstreams def post_treatment(self, rec_id, patch=False): rec = models.Treatment(rec_id) payload = {} conc = rec.concentration if conc: payload["amount"] = conc conc_unit = models.Unit(rec.concentration_unit_id) payload["amount_units"] = conc_unit.name duration = rec.duration if duration: payload["duration"] = duration payload["duration_units"] = rec.duration_units temp = rec.temperature_celsius if temp: payload["temperature"] = temp payload["temperature_units"] = "Celsius" ttn = models.TreatmentTermName(rec.treatment_term_name_id) payload["treatment_term_id"] = ttn["accession"] payload["treatment_term_name"] = ttn["name"] payload["treatment_type"] = rec.treatment_type payload["documents"] = self.post_documents(rec.document_ids) # Submit if patch: upstream_id = self.patch(pyaload, rec.upstream_identifier) else: upstream_id = self.post(payload=payload, dcc_profile="treatment", pulsar_model=models.Treatment, pulsar_rec_id=rec_id) return upstream_id
[docs] def post_vendor(self, rec_id, patch=False): """ Vendors must be registered directly by the DCC personel. """ raise Exception("Vendors must be registered directly by the DCC personel.")
[docs] def get_gel_lane_with_biosample(self, immunoblot_id, biosample_id): """ Given an Immunoblot record ID, and a Biosample record ID, returns the GelLane object with the given Biosample. This method assumes that a Gel won't have more than one GelLane with the same Biosample. Note that there should only be 1 Gel, even though the Rails Immunoblot model allows many - on the 'to fix list'. Args: immunoblot_id: `int`. Immunoblot record ID. biosample_id: `int`. Biosample record ID. Returns: `None` if the GelLane didn't pass. Otherwise, a `pulsarpy.models.GelLane` instance. Raises: `IpLaneException`: One of multiple issues that could be present as indicated by the error message, i.e. * The Biosample doesn't have an associated Gel * There isn't a GelLane with the Biosample on it. """ ip = models.Immunoblot(immunoblot_id) if not ip.gel_ids: raise IpLaneException("IP {} for Biosample {} does not have a Gel.".format(ip.id, biosample_id)) gel = models.Gel(ip.gel_ids[0]) gl = "" # GelLane for gel_lane_id in gel.gel_lane_ids: gel_lane = models.GelLane(gel_lane_id) if biosample_id == gel_lane.biosample_id: gl = gel_lane if not gl: raise IpLaneException("Could't find a GelLane that has Biosample {} on Immunoblot {}.".format(biosample_id, immunoblot_id)) if not gl.attrs["pass"]: print("GelLane didn't pass for Biosample {}.".format(biosample_id)) return None return gl
[docs] def post_ip_biosample_characterization(self, immunoblot_id, biosample_id, patch=False): """ Submits a Pulsar Immunoblot for a specific lane (biosample) on a Gel to the ENCODE biosample_characterization profile. Such an immunoblot is used to show whether the eGFP-tagged target (using CRISPR) is expressed (has a band in the size range of the expected taget size). Only submit these after the ChipSeq experiment (and hence CrisprModification) has been submitted. Even though some Biosamples have a successful IP, they don't all need to be submitted. For example, in one case a Biosample was lost after a successful IP and hence couldn't do the crosslinking for ChIP later on. Another reason may be that we already have enough validated Biosamples to submit. This method makes the assumption that a given gel won't have more than one lane with the same Biosample. Returns: `None`: The Biosmaple isn't already registered on the Portal. `None`: The Biosample has an IP, but not one that passes (based on the GelLane.pass attribute) `None`: The Non-WT Biosample isn't yet registerd on the Portal `None`: The non-WT biosample that doesn't have a ChipSeq object. `int`: The ID of the created biosample_characterization record on the Portal. """ GEL_IMAGE_DIR = os.path.join(os.path.curdir, "gel_images") if not os.path.exists(GEL_IMAGE_DIR): os.mkdir(GEL_IMAGE_DIR) biosample = models.Biosample(biosample_id) if not biosample.upstream_identifier: # For now, don't sumbit until jadrian says otherwise. print("Biosample missing upstream - skipping.") return None ip = models.Immunoblot(immunoblot_id) gl = self.get_gel_lane_with_biosample(immunoblot_id=immunoblot_id, biosample_id=biosample_id) if not gl: return None payload = {} if not biosample.wild_type: # Find WT parent that has an associated Immunoblot to use as control # and set that Biosample's upstream_identifier as the value of the ENCODE property # biosample_characterization.wildtype_biosample. Currently, the WT biosample is determined # soley by biosample_term_name. biosample_term_name = models.BiosampleTermName(biosample.biosample_term_name_id).name if biosample_term_name == "A549": wt_biosample_id = 2551 elif biosample_term_name == "GM23338": wt_biosample_id = 2559 elif biosample_term_name == "HepG2": wt_biosample_id = 2510 elif biosample_term_name == "MCF-7": wt_biosample_id = 2515 elif biosample_term_name == "SK-N-SH": wt_biosample_id = 11200 else: msg = "Can't submit IP biosample_characterization for Biosample {} IP {} since the wild type biosample with its own Immunoblot can't be determined for biosample term name {}.".format(biosample.id, immunoblot_id, biosample_term_name) error_logger.error(msg) return None wt_biosample = models.Biosample(wt_biosample_id) wt_biosample_upstream = wt_biosample.upstream_identifier if not wt_biosample_upstream: print("POSTING WT parent Biosample.") wt_biosample_upstream = self.post_biosample(wt_biosample_id) # Then POST the Immunoblot linked to the WT Parent Biosample. Note that it's possible # but unlikely for a Biosample to be linked to multiple Immunoblots. In that case, the # first one will be submitted. wt_ip_id = wt_biosample.immunoblot_ids[0] print("POSTING WT parent Biosample's Immunoblot.") self.post_ip_biosample_characterization(immunoblot_id=wt_ip_id, biosample_id=wt_biosample_id, patch=False) payload["wildtype_biosample"] = wt_biosample_upstream payload["characterization_method"] = "immunoblot" payload["characterizes"] = biosample.upstream_identifier payload["documents"] = self.post_documents(ip.document_ids) payload["review"] = {"lab": "richard-myers", "lane": gl.lane_number} # Process attachment property for gel image. # A Pulsar Gel object can have many GelImages (different exposure times), but Jess has indicated # to take the first one if multiple are present. gel = models.Gel(gl.gel_id) if not gel.gel_image_ids: msg = "GelLane {} of Gel {} for Biosample {} is missing a GelImage.".format(gl.id, gel.id, biosample_id) error_logger.error(msg) raise IpLaneException(msg) gel_image = models.GelImage(sorted(gel.gel_image_ids)[0]) # The image URI is expected to have public read permission. # Some paths store a // at the beginning to tell the browser to use the same protocol as it's currently # using (HTTP/HTTPS). In that case, just prefix it with 'https:'. image_uri = gel_image.image if image_uri.startswith("//"): image_uri = "https:" + image_uri image_basename = os.path.basename(image_uri) image_exists_locally = os.path.join(GEL_IMAGE_DIR, image_basename) if not os.path.exists(image_exists_locally): # Then download it stream = requests.get(image_uri, stream=True) fout = open(image_exists_locally, 'wb') for line in stream.iter_content(chunk_size=512): fout.write(line) fout.close() payload["attachment"] = {"path": image_exists_locally} # Caption btn = models.BiosampleTermName(biosample.biosample_term_name_id).name caption = "Immunoprecipitation was performed on nuclear extracts from biosample {}".format(biosample.upstream_identifier) if biosample.wild_type: caption += " ({} wild type)".format(btn) else: if not biosample.chipseq_experiment_ids: msg = "Biosample {} is not linked to any ChipSeq experiments.".format(biosample_id) error_logger.error(msg) return None #raise IpLaneException(msg) elif len(biosample.chipseq_experiment_ids) > 1: msg = "Biosample {} is linked to more than 1 ChipSeq experiment. It is not known as to which one this IP relates.".format(biosample_id) error_logger.error(msg) raise IpLaneException(msg) chipseq_exp = models.ChipseqExperiment(biosample.chipseq_experiment_ids[0]) if not chipseq_exp.upstream_identifier: msg = "ChipSeq experiment {} for Biosample {} needs to be submitted prior to submitting the IP biosample_characterization.".format(chipseq_exp.id, biosample_id) error_logger.error(msg) raise IpLaneException(msg) crispr_modification = models.CrisprModification(biosample.crispr_modification_id) if not crispr_modification.upstream_identifier: msg = "Biosample {} has a CrisprModification, but it isn't yet registered on the Portal.".format(biosample_id) error_logger.error(msg) raise IpLaneException(msg) crispr_construct = models.CrisprConstruct(crispr_modification.crispr_construct_ids[0]) target = models.Target(crispr_construct.target_id) # Get biosample_replicate_number on experiment in Portal rep_hash = encode_utils.replicate.ExpReplicates(self.ENC_CONN, chipseq_exp.upstream_identifier).rep_hash brn = rep_hash[biosample.upstream_identifier]["brn"] caption += " ({} eGFP-{} replicate {})".format(btn, target.name, brn) caption += " cells using anti-eGFP antibody. The image shows western blot analysis of input" caption += " (lane 1)," if not biosample.wild_type: caption += " immunoprecipitate (lane 2), and mock immunoprecipitate using IgG (lane 3)." else: caption += " and immunoprecipitate (lane 2)." caption += " Molecular weight standard (Bio-Rad, cat. # 161-0374) contains 10" caption += " pre-stained recombinant proteins (250, 150, 100, 75, 50, 37, 25, 20, 15, and 10 kD)." if not biosample.wild_type: caption += " The target molecular weight is {} kD as indicated with an arrow.".format(gl.expected_product_size) if gl.low_target_band_intensity: caption += " Lower size bands which may be due to potential degradation products are marked with asterisks." payload["caption"] = caption submitter_comment = ip.submitter_comments if submitter_comment: payload["submitter_comment"] = submitter_comment # Submit payload if patch: upstream_id = self.patch(payload, gl.upstream_identifier) else: upstream_id = self.post(payload=payload, dcc_profile="biosample_characterization", pulsar_model=models.GelLane, pulsar_rec_id=gl.id) return upstream_id
def post_biosample(self, rec_id, patch=False): rec = models.Biosample(rec_id) # The alias lab prefixes will be set in the encode_utils package if the DCC_LAB environment # variable is set. payload = {} # Add biosample_term_name, biosample_term_id, biosample_type props btn = models.BiosampleTermName(rec.biosample_term_name_id) bty = models.BiosampleType(rec.biosample_type_id) payload["biosample_ontology"] = self.ENC_CONN.get_biosample_type(classification=bty.name, term_id=btn.accession)["@id"] date_biosample_taken = rec.date_biosample_taken if date_biosample_taken: if bty.name == "tissue": payload["date_obtained"] = date_biosample_taken else: payload["culture_harvest_date"] = date_biosample_taken desc = rec.description.strip() if desc: payload["description"] = desc donor = models.Donor(rec.donor_id) donor_upstream = donor.get_upstream() if not donor_upstream: raise Exception("Donor '{}' of biosample '{}' does not have its upstream set. Donors must be registered with the DCC directly.".format(donor.id, rec_id)) payload["donor"] = donor_upstream lot_id = rec.lot_identifier if lot_id: payload["lot_id"] = lot_id nih_cert = rec.nih_institutional_certification if nih_cert: payload["nih_institutional_certification"] = nih_cert payload["organism"] = "human" passage_number = rec.passage_number if passage_number: payload["passage_number"] = passage_number starting_amount = rec.starting_amount if starting_amount: payload["starting_amount"] = starting_amount payload["starting_amount_units"] = models.Unit(rec.starting_amount_units_id).name submitter_comment = rec.submitter_comments if submitter_comment: payload["submitter_comment"] = submitter_comment preservation_method = rec.tissue_preservation_method if preservation_method: payload["preservation_method"] = preservation_method prod_id = rec.vendor_product_identifier if prod_id: payload["product_id"] = prod_id cm_id = rec.crispr_modification_id if cm_id: payload["genetic_modifications"] = [self.post_crispr_modification(cm_id)] payload["documents"] = self.post_documents(rec.document_ids) part_of_biosample_id = rec.part_of_id if part_of_biosample_id: part_of_biosample = models.Biosample(part_of_biosample_id) pob_upstream = part_of_biosample.get_upstream() if not pob_upstream or not pob_upstream.startswith("ENCBS"): pob_upstream = self.post_biosample(part_of_biosample_id) payload["part_of"] = pob_upstream pooled_from_biosample_ids = rec.pooled_from_biosample_ids if pooled_from_biosample_ids: pooled_from_biosamples = [models.Biosample(p) for p in pooled_from_biosample_ids] payload["pooled_from"] = [] for p in pooled_from_biosamples: p_upstream = p.get_upstream() if not p_upstream: p_upstream = self.post_biosample(p.id) payload["pooled_from"].append(p_upstream) if rec.vendor_id: payload["source"] = self.get_vendor_id_from_encodeportal(rec.vendor_id) else: payload["source"] = "michael-snyder" payload["treatments"] = self.post_treatments(rec.treatment_ids) if patch: upstream_id = self.patch(payload, rec.upstream_identifier) else: upstream_id = self.post(payload=payload, dcc_profile="biosample", pulsar_model=models.Biosample, pulsar_rec_id=rec_id) return upstream_id
[docs] def post_library(self, rec_id, patch=False): """ This method will check whether the biosample associated to this library is submitted. If it isn't, it will first submit the biosample. """ rec = models.Library(rec_id) payload = {} biosample = models.Biosample(rec.biosample_id) # If this Library record is a SingleCellSorting.library_prototype, then the Biosample it will # be linked to is the SingleCellSorting.sorting_biosample. payload["biosample"] = biosample.upstream_identifier payload["documents"] = self.post_documents(rec.document_ids) fragmentation_method_id = rec.library_fragmentation_method_id if fragmentation_method_id: fragmentation_method = models.LibraryFragmentationMethod(fragmentation_method_id) payload["fragmentation_methods"] = [fragmentation_method.name] payload["lot_id"] = rec.lot_identifier payload["nucleic_acid_term_name"] = models.NucleicAcidTerm(rec.nucleic_acid_term_id).name payload["product_id"] = rec.vendor_product_identifier payload["size_range"] = rec.size_range payload["strand_specificity"] = bool(rec.strand_specific) if rec.vendor_id: payload["source"] = self.get_vendor_id_from_encodeportal(rec.vendor_id) else: payload["source"] = "michael-snyder" ssc_id = rec.single_cell_sorting_id if ssc_id: barcode_details = self.get_barcode_details_for_ssc(ssc_id=ssc_id) payload["barcode_details"] = barcode_details # Submit payload if patch: upstream_id = self.patch(payload, rec.upstream_identifier) else: upstream_id = self.post(payload=payload, dcc_profile="library", pulsar_model=models.Library, pulsar_rec_id=rec_id) return upstream_id
[docs] def post_replicate(self, pulsar_library_id, dcc_exp_id, patch=False): """ Submits a replicate record, linked to the specified library and experiment. First, replicates on the experiment will be searched to see if a replicate already exists for a specifc biosample and library combination, and if so then that repicate's JSON from the ENCODE Portal is returned. If the associated experiment is ChIP-seq, and isn't a control experpiment, then the replicate will be submitted with a link to antibody ENCAB728YTO (AB-9 in Pulsar), which is the GFP-specific antibody used to pull down GFP-tagged TFs. Args: pulsar_library_id: `int`. The ID of a Library record in Pulsar. dcc_exp_id: `int`. The ID of the experiment record on the Portal to link the replicate to. Returns: `str`: The replicate.uuid property value of the record on the ENCODE Portal. """ # Required fields to submit to a replicate are: # -biological_replicate_number # -experiment # -technical_replicate_number #dcc_lib = self.ENC_CONN.get(ignore404=False, rec_ids=dcc_library_id) print(">>> In dcc_submit.post_replicate()") payload = {} lib = models.Library(pulsar_library_id) payload["library"] = lib.upstream_identifier biosample_id = lib.biosample_id biosample = models.Biosample(biosample_id) payload["aliases"] = [biosample.upstream_identifier + "-" + lib.upstream_identifier] payload["experiment"] = dcc_exp_id dcc_exp = self.ENC_CONN.get(rec_ids=dcc_exp_id) # Check if replicate already exists for this library exp_reps_instance = encode_utils.replicate.ExpReplicates(self.ENC_CONN, dcc_exp_id) rep_json = exp_reps_instance.get_rep(biosample_accession=biosample.upstream_identifier, library_accession=lib.upstream_identifier) brn = "" trn = "" if rep_json and not patch: return rep_json["uuid"] elif rep_json: brn = rep_json["biological_replicate_number"] trn = rep_json["technical_replicate_number"] else: # Then there isn't a replicate yet for this library, and maybe not even the biosample. if not biosample.upstream_identifier in exp_reps_instance.rep_hash: brn = exp_reps_instance.suggest_brn() trn = 1 else: brn = exp_reps_instance.rep_hash[biosample.upstream_identifier]["brn"] trn = exp_reps_instance.suggest_trn(biosample.upstream_identifier) if dcc_exp["assay_term_name"] == "ChIP-seq": # Only add antibody if not replicate on control experiment if not dcc_exp["target"]["uuid"] == "89839f28-ad35-4bb4-a214-ee65d0a97d8d": # Control-human target payload["antibody"] = "ENCAB728YTO" #AB-9 in Pulsar #payload["aliases"] = # Set biological_replicate_number and technical_replicate_number. For ssATAC-seq experiments, # these two attributes don't really make sense, but they are required to submit, so ... payload["biological_replicate_number"] = brn payload["technical_replicate_number"] = trn # Submit payload if patch: upstream_id = self.patch(payload, rep_json["uuid"]) else: # POST to ENCODE Portal. Don't use post() method defined here that is a wrapper over # `encode_utils.connection.Connection.post`, since the wrapper only works if the record we # are POSTING has a corresponding record type on the Portal. Since Pulsar doesn't have a # corresponding replicate model, we can't use the wrapper method. payload[self.ENC_CONN.PROFILE_KEY] = "replicate" res_json = self.ENC_CONN.post(payload=payload) upstream_id = euu.get_record_id(res_json) return upstream_id
def upload_fastq_files(self, dcc_biosample_id): pulsar_bio = models.Biosample(upstream_id=dcc_biosample_id)
[docs] def post_fastq_file(self, pulsar_sres_id, read_num, enc_replicate_id, patch=False): """ Creates a file record on the ENCODE Portal. Checks the SequencingResult in Pulsar to see where the file is stored. If stored in DNAnexus, the file will be downloaded locally into the directory given by ``pulsarpy_to_encodedcc.FASTQ_FOLDER`` (the download folder will be checked first to see if the file was previously downloaded before attempting to download. After the file object is created on the ENCODE Portal, it's accession will be stored as the upstream identifier in the Pulsar SequencingResult record for the given read. Thus, if a file object was creatd for a R1 FASTQ file, then the `SequencingResult.read1_upstream_identifier` attribute is updated. If instead a file object was created for a R2 FASTQ file, then the `SequencingResult.read2_upstream_identifier`` attribute is updated. Some rather complex logic is used to determine the control FASTQ files when submitting an experimental replicate's FASTQ file. If the Biosample associated with the SequencingResult is part of a ChipseqExperiment, then the control biosamples consist of the paired input(s) and the wild type input, which in Pulsar are given the attribute names `ChipseqExperiment.control_replicates` and `ChipseqExperiment.wild_type_control`. A non-control file object on the ENCODE Portal needs to have the ``controlled_by`` property set, which points to one or more control FASTQ file accessions on the ENCODE Portal. We normally submit them by matching read numbers, so if the file object we are creating is for a R1 FASTQ file, then all the controlled_by accessions are also R1 FASTQ files. The challenge is in knowing which SequencingResult set to use for control FASTQ files. Since a Biosample can have multiple Libraries, which can have multiple SequencingRequests, which can have multiple SequencingRuns, there can be many sets of SequencingResults. However, since in most cases there will only be one of each, the approach taken here is to use the SequencingResults of the latest SequencingRun of the latest SequencingRequest. Once this simplicity fails to hold, an updated approach will need to be taken. If you have alreay created the file record on the Portal and for some reason the FASTQs didn't upload, you can try to reupload the FASTQs by calling this method with patch equal to False. Args: pulsar_sres_id: A SequencingResult record in Pulsar. read_num: `int`. being either 1 or 2. Use 1 for the forwrard reads FASTQ file, and 2 for the reverse reads FASTQ file. A SequencingResult in Pulsar stores the location of both files (if paired-end sequening). end_replicate_id: `str`. The identifier of the DCC replicate record that the file record is to be associated with. Returns: `dict`. The response from the encode-utils POST or PATCH operation. """ sres = models.SequencingResult(pulsar_sres_id) lib = models.Library(sres.library_id) bio = models.Biosample(lib.biosample_id) srun = models.SequencingRun(sres.sequencing_run_id) sreq = models.SequencingRequest(srun.sequencing_request_id) platform = models.SequencingPlatform(sreq.sequencing_platform_id) payload = {} payload["aliases"] = [] payload["read_length"] = 100 payload["file_format"] = "fastq" payload["output_type"] = "reads" # The Pulsar SequencingPlatform must already have the upstream_identifier attribute set. payload["platform"] = platform.upstream_identifier # set flowcell_details flowcell_details = {} flowcell_details["barcode"] = lib.get_barcode_sequence() flowcell_details["lane"] = str(srun.lane) payload["flowcell_details"] = [flowcell_details] payload["replicate"] = enc_replicate_id #if sreq.paired_end: # payload["run_type"] = "paired-ended" #else: # payload["run_type"] = "single-ended" payload["run_type"] = "paired-ended" if read_num == 1: payload["paired_end"] = "1" file_uri = sres.read1_uri upstream_id = sres.read1_upstream_identifier read_count = sres.read1_count elif read_num == 2: payload["paired_end"] = "2" file_uri = sres.read2_uri upstream_id = sres.read2_upstream_identifier read_count = sres.read2_count # Need to set paired_with key in the payload. In this case, # it is expected that R1 has been already submitted. if not sres.read1_upstream_identifier: raise Exception("Can't set paired_with for SequencingResult {} since R1 doesn't have an upstream set.".format(sres.id)) payload["paired_with"] = sres.read1_upstream_identifier if not file_uri: raise NoFastqFile("SequencingResult '{}' for R{} does not have a FASTQ file path set.".format(pulsar_sres_id, read_num)) elif not upstream_id and patch: raise Exception("Can't PATCH file object on the Portal when the SequencingResult {} for read {} doesn't have an upstream ID set.".format(pulsar_sres_id, read_num)) data_storage = models.DataStorage(srun.data_storage_id) data_storage_provider = models.DataStorageProvider(data_storage.data_storage_provider_id) # Initialize file_path to be empty string. file_path = "" if data_storage_provider.name == "DNAnexus": dx_file = dxpy.DXFile(dxid=file_uri) file_path = os.path.join(FASTQ_FOLDER, dx_file.name) # Check if file exists and is non-empty in download directory before attempting to download. if not patch: if not os.path.exists(file_path) or not os.path.getsize(file_path): # Download file. dxpy.download_dxfile(dxid=file_uri, filename=file_path, show_progress=True) file_ref = "dnanexus${}".format(file_uri) payload["aliases"].append(file_ref) payload["aliases"].append(dx_file.name) elif data_storage_provider == "AWS S3 Bucket": file_path = file_uri # md5sum key added to payload by encode-utils. payload["aliases"].append(file_uri) # i.e. s3://bucket-name/key payload["aliases"].append(os.path.basename(file_uri)) if not file_path: raise Exception("Could not locate FASTQ file for SequencingResult {}; read number {}.".format(pulsar_sres_id, read_num)) payload["submitted_file_name"] = file_path dcc_rep = self.ENC_CONN.get(rec_ids=enc_replicate_id, ignore404=False) dcc_exp = dcc_rep["experiment"] payload["dataset"] = dcc_exp["accession"] dcc_exp_accession = dcc_exp["accession"] dcc_exp_type = dcc_exp["assay_term_name"] #i.e. ChIP-seq if dcc_exp_type == "ChIP-seq": if not bio.control and not bio.wild_type: controlled_by = self.get_chipseq_controlled_by(pulsar_biosample=bio, read_num=read_num, dcc_exp_id=dcc_exp_accession) if controlled_by: # Will be empty if this is a already a control SequencingResult file. payload["controlled_by"] = controlled_by else: pass #raise Exception("No controlled_by could be found for SequencingResult {} read number {}.".format(pulsar_sres_id, read_num)) # Instead of raise an Exception, let it slide. Pulsar users aren't setting the # boolean fields control and wild_type_control as they should be so it's not reliable. else: if dcc_exp_type != "ATAC-seq": raise Exception("There isn't yet support to set controlled_by for experiments of type {}.".format(dcc_exp_type)) # POST to ENCODE Portal. Don't use post() method defined here that is a wrapper over # `encode_utils.connection.Connection.post`, since the wrapper only works if the record we # are POSTING has a corresponding record type on the Portal. Since Pulsar doesn't have a # corresponding file model, we can't use the wrapper method. So we'll have to manually # set the upstream identifier in the attribute `SequencingResult.read1_upstream_identifier` # or `SequencingResult.read2_upstream_identifier`. if patch: upstream_id = self.patch(payload=payload, upstream_id=upstream_id) else: payload[self.ENC_CONN.PROFILE_KEY] = "file" res_json = self.ENC_CONN.post(payload=payload) upstream_id = res_json["accession"] if read_num == 1: sres.patch({"read1_upstream_identifier": upstream_id}) else: sres.patch({"read2_upstream_identifier": upstream_id}) return upstream_id
[docs] def get_chipseq_controlled_by(self, pulsar_biosample, read_num, dcc_exp_id): """ Given a p Returns: `list`: The upstream identifiers for the control file objects on the ENCODE Portal. """ if pulsar_biosample.control or pulsar_biosample.wild_type: return [] bio_id = pulsar_biosample.id controlled_by = [] chipseq_experiment = pulsarpy.models.ChipseqExperiment(upstream=dcc_exp_id) # First add pooled input. Normally one control but could be more. ctl_map = chipseq_experiment.paired_input_control_map() if bio_id in ctl_map: for ctl_id in ctl_map[bio_id]: ctl = pulsarpy.models.Biosample(ctl_id) lib = ctl.get_latest_library() controlled_by.extend(self.get_all_seqresult_fastq_file_accessions(lib)[read_num]) # Next add WT input wt_input_id = chipseq_experiment.wild_type_control_id if wt_input_id: wt_input = pulsarpy.models.Biosample(wt_input_id) lib = wt_input.get_latest_library() controlled_by.extend(self.get_all_seqresult_fastq_file_accessions(lib)[read_num]) return controlled_by
def get_all_seqresult_fastq_file_accessions(self, pulsar_lib): res = {1: [], 2: []} sequencing_result_ids = pulsar_lib.sequencing_result_ids for sres_id in sequencing_result_ids: sres = pulsarpy.models.SequencingResult(sres_id) r1_accession = sres.get_upstream_identifier(read_num=1) if not r1_accession: raise Exception("Upstream identifier not set for SequencingResult {}, read number {}.".format(sres_id, 1)) res[1].append(r1_accession) r2_accession = sres.get_upstream_identifier(read_num=2) if not r2_accession: raise Exception("Upstream identifier not set for SequencingResult {}, read number {}.".format(sres_id, 2)) res[2].append(r2_accession) return res
[docs] def get_barcode_details_for_ssc(self, ssc_id): """ This purpose of this method is to provide a value to the library.barcode_details property of the Library profile on the ENCODE Portal. That property taks an array of objects whose properties are the 'barcode', 'plate_id', and 'plate_location'. Args: ssc_id: The Pulsar ID for a SingleCellSorting record. """ ssc = models.SingleCellSorting(ssc_id) lib_prototype_id = ssc.library_prototype_id lib = models.Library(lib_prototype_id) paired_end = lib.paired_end plate_ids = ssc.plate_ids plates = [models.Plate(p) for p in plate_ids] results = [] for p in plates: for well_id in p.well_ids: well = models.Well(well_id) details = {} details["plate_id"] = p.name details["plate_location"] = well.name well_biosample = models.Biosample(well.biosample_id) lib_id = well_biosample.library_ids[-1] # Doesn't make sense to have more than one library for single cell experiments. lib = models.Library(lib_id) if not paired_end: barcode_id = lib.barcode_id barcode = models.Barcode(barcode_id).sequence else: pbc_id = lib.paired_barcode_id pbc = models.PairedBarcode(pbc_id) barcode = pbc.index1["sequence"] + "-" + pbc.index2["sequence"] details["barcode"] = barcode results.append(details) return results
def post_single_cell_sorting(self, rec_id, patch=False): rec = models.SingleCellSorting(rec_id) sorting_biosample_id = rec.sorting_biosample_id sorting_biosample = models.Biosample(sorting_biosample_id) payload = {} # Set the explicitly required properties first: payload["assay_term_name"] = "single-cell ATAC-seq" payload["experiment_classification"] = ["functional genomics"] # And now the rest payload["biosample_ontology"] = sorting_biosample["biosample_term_name"]["name"] desc = rec.description.strip() if desc: payload["description"] = desc payload["documents"] = self.post_documents(rec.document_ids) exp_upstream = self.post(payload=payload, dcc_profile="experiment", pulsar_model=models.SingleCellSorting, pulsar_rec_id=rec_id) # Submit biosample self.post_biosample(rec_id=sorting_biosample.id, patch=patch) # Submit library_prototype (which is linked to sorting_biosample when it is created) library_prototype_id = rec.library_prototype_id library_upstream = self.post_library(rec_id=library_prototype_id, patch=patch) # Submit replicate. # The experiment will be determined via inspection of associated biosample. replicate_upstream = self.post_replicate(library_upstream=library_upstream, patch=patch) # A SingleCellSorting has many SequencingRequests through the Plates association. sreq_ids = rec.sequencing_request_ids sreqs = [models.SequencingRequest(s) for s in sreq_ids] for sreq in sreqs: srun_ids = sreq.sequencing_run_ids sruns = [models.SequencingRun(s) for s in srun_ids] for run in sruns: storage_loc_id = run.storage_location_id sres_ids = run.sequencing_result_ids
# Submit a file record if __name__ == "__main__": s = Submit(dcc_mode="v79x0-test-master.demo.encodedcc.org") s.post_chipseq_exp(rec_id=164, patch=False) #s.post_chipseq_control_experiments(164)