# Source code for pulsarpy_to_encodedcc.dcc_submit

# -*- coding: utf-8 -*-

###
# © 2018 The Board of Trustees of the Leland Stanford Junior University
# Nathaniel Watson
# nathankw@stanford.edu
###

"""
Required environment variables
  1) Those that are required in the pulsarpy.models module for connecting to Pulsar LIMS:
     -PULSAR_API_URL
     -PULSAR_TOKEN
  2) Those that are required in the encode_utils.connection module to connect to the ENCODE Portal:
     -DCC_API_KEY
     -DCC_SECRET_KEY

Optional environment variables:
  1) DCC_MODE_ - Specifies which ENCODE Portal host to connect to. If not set, then must be provided
     when instantiating the Submit() class.

..  _DCC_MODE: https://encode-utils.readthedocs.io/en/latest/connection.html#encode_utils.connection.Connection.dcc_mode

"""

import base64
import logging
import os
import re
import requests
import sys

import dxpy

import pulsarpy_to_encodedcc
from pulsarpy_to_encodedcc import FASTQ_FOLDER, log_error
from pulsarpy import models
import pulsarpy.utils
import encode_utils as eu
import encode_utils.aws_storage
import encode_utils.replicate
import encode_utils.connection as euc
import encode_utils.utils as euu
import pdb

# Logger used for error messages; its name is defined by the pulsarpy_to_encodedcc package.
error_logger = logging.getLogger(pulsarpy_to_encodedcc.ERROR_LOGGER_NAME)
# Regex matching a run of one or more consecutive space characters. Used by
# Submit.sanitize_prop_val() to collapse such runs into a single space.
space_reg = re.compile(r' +')


class IpLaneException(Exception):
    """
    Raised when posting an IP. This is a temporary class that'll be removed once the exceptions
    are handled properly in the Submit.post_ip_lane method.
    """

class ExpMissingReplicates(Exception):
    """
    Raised when trying to POST an experiment to the Portal (such as a control experiment) and
    there aren't any replicates (Biosample records) to attach to it.
    """

class MissingSequencingResult(Exception):
    """
    Raised in Submit.post_library_through_fastq() when the Pulsar Library being submitted
    doesn't have any SequencingResult records.
    """

class MissingTargetUpstream(Exception):
    """
    Raised when submitting a record that tries to link to a DCC target, but the target record in
    Pulsar doesn't have the upstream_identifier attribute set.
    """
    

class UpstreamNotSet(Exception):
    """
    Raised when a Pulsar record is expected to have its upstream_identifier attribute set, but
    doesn't (for example, in Submit.get_vendor_id_from_encodeportal()).
    """


class NoFastqFile(Exception):
    """
    Raised in Submit.post_fastq_file() when submitting either a R1 FASTQ file or a R2 FASTQ file,
    and the filepath isn't set in the corresponding SequencingResult record in Pulsar.
    """


#def dec(model_class):
#    """
#    A decorator that is to be used with the post_* methods defined in the Submit class defined below.
#    """
#
#    def wrapper(func):
#
#        def inner(self, rec_id, patch=False, *args, **kwargs):
#            """
#            Saves time by checking whether a record needs to be posted to the Portal before its
#            payload is constructed.  It need not be posted if in Pulsar it has the upstream_identifier
#            attribute set AND the 'patch' argument is False. The decorated method will thus only run
#            if upstream_identifier isn't set, or if the 'patch' argument is True. 
#            """
#            rec = model_class(rec_id) 
#            upstream = rec.get_upstream() 
#            if upstream and not patch:
#                # Then no need to post
#                return upstream
#            else:
#                self.func(rec_id=rec_id, patch=patch, *args, **kwargs)
#        return inner
#
#    return wrapper


class Submit():
    """
    Contains methods for submitting various types of objects in Pulsar to the ENCODE Portal.
    """

    def __init__(self, dcc_mode=None, extend_arrays=True):
        if not dcc_mode:
            try:                                                                                    
                dcc_mode = os.environ["DCC_MODE"]                                                   
                print("Utilizing DCC_MODE environment variable.")                 
            except KeyError:                                                                        
                print("ERROR: You must supply the `dcc_mode` argument or set the environment variable DCC_MODE.")
                sys.exit(-1)                                                                        
        self.dcc_mode = dcc_mode
        self.ENC_CONN = euc.Connection(self.dcc_mode, submission=True)
        #: When patching, there is the option to extend array properties or overwrite their values.
        #: The default is to extend.
        self.extend_arrays = extend_arrays

    def filter_standard_attrs(self, payload):
        """
        Removes, in place, the standard bookkeeping attributes from a payload, as well as every
        key that starts with an underscore.

        Args:
            payload: `dict`. The payload to clean. It is modified in place.

        Returns:
            `dict`: The same `payload` object, minus the filtered keys.
        """
        for attr in ("created_at", "id", "owner_id", "updated_at", "user_id"):
            payload.pop(attr, None)
        # Collect the keys first: removing entries while iterating over the dict itself raises
        # "RuntimeError: dictionary changed size during iteration".
        private_keys = [key for key in payload if isinstance(key, str) and key.startswith("_")]
        for key in private_keys:
            del payload[key]
        return payload

[docs]    def sanitize_prop_val(self, txt):
        """
        Replaces characters that can be problematic in property values on the ENCODE Portal. 
        For example, the '/' character in an alias is a problem since the alias is an identifying property
        that can be used in a URL to view the record. In this case, the '/' will be interpreted as a
        path separator. 

        Characters that get replaced: currently, just '/' with '-'. 

        Args:
            txt: `str`. The value to clean.

        Returns:
            `str`. The cleaned value that is submission acceptable. 
        """
        txt = txt.replace("/","-").strip()
        # Replace contiguous spaces with a single space
        return space_reg.sub(" ", txt)
        

[docs]    def get_vendor_id_from_encodeportal(self, pulsar_vendor_id):
        """
        Given a Pulsar Vendor record ID, returns the upstream identifier. 

        Raises:
            `UpstreamNotSet`: The Pulsar vendor.upstream_identifier attribute isn't set.
        """
        if not pulsar_vendor_id:
            return ""
        vendor = models.Vendor(pulsar_vendor_id)
        upstream = vendor.upstream_identifier
        if not upstream:
            msg = "Vendor {} with Pulsar ID {} does not have the upstream_identifier attribute set.".format(vendor.name, vendor.id)
            raise UpstreamNotSet(msg)
        return upstream
                  

[docs]    def patch(self, payload, upstream_id, dont_extend_arrays=False):
        """
        A wrapper over `encode_utils.connection.Connection.patch()`.

        Args:
            dont_extend_arrays: `bool`. Dynamic way to signal not to extend array property values. 
                If not True, then the boolean value of `self.extend_arrays` determines whether
                arrays are extended. 

        Returns:
            `dict`: The JSON response from the PATCH operation, or an empty dict if the record doesn't
                exist on the Portal.  See ``encode_utils.connection.Connection.patch()`` for more details.
        """
        payload[self.ENC_CONN.ENCID_KEY] = upstream_id
        if dont_extend_arrays:
            extend = False
        else:
            extend = self.extend_arrays
        response_json = self.ENC_CONN.patch(payload=payload, extend_array_values=extend)
        if not response_json:
            raise Exception("Couldn't PATCH record on the Portal since it doesn't exist.")
        return upstream_id

[docs]    def post(self, payload, dcc_profile, pulsar_model, pulsar_rec_id):
        """
        A wrapper over `encode_utils.connection.Connection.post()`. 

        First checks if the Pulsar record has an upstream_identifier set, and if set, returns
        it rather than attempting to re-post.

        Adds aliases to the payload being the record's record ID and name. 

        Sets the profile key in the payload.

        If the record is successfully posted to the prod ENCODE Portal, then sets the 
        upstream_identifier attribute in the Pulsar record.
    
        Args:
            payload: `dict`. The new record attributes to submit.
            dcc_profile: `str`. The name of the ENCODE Profile for this record, i.e. 'biosample',
                'genetic_modification'.
            pulsar_model: One of the defined subclasses of the ``models.Model`` class, i.e. 
                ``models.Model.Biosample``, which will be used to set the Pulsar record's 
                upstream_identifier attribute after a successful POST to the ENCODE Portal.
            pulsar_rec_id: `str`. The identifier of the Pulsar record to POST to the DCC.
        Returns:
            `str`: The upstream identifier for the new record on the ENCODE Portal, or the existing
            upstream identifier if the record already exists; see ``encode_utils.utils.get_record_id()``
            for more details. 
        """
        payload[self.ENC_CONN.PROFILE_KEY] = dcc_profile
        pulsar_rec = pulsar_model(pulsar_rec_id)
        upstream = pulsar_rec.upstream_identifier
        if upstream:
            # May sure that upstream exists. Could be that the upstream identifier belongs to a different
            # server, i.e. test or a demo, than the one we are currently connected to. 
            # No need to POST. 
            if upstream.startswith("ENC"):
                # For sure this is a production accession and we should leave it alone.
                return upstream
            exists_on_server = self.ENC_CONN.get(rec_ids=upstream)
            if exists_on_server:
                return upstream
        aliases = payload.get("aliases", [])
        abbrev_alias = pulsar_rec.abbrev_id()
        if abbrev_alias not in aliases:
            aliases.append(abbrev_alias)
        # Add value of 'name' property as an alias, if this property exists for the given model.
        try:
            name = self.sanitize_prop_val(pulsar_rec.name)
            if name:
                # Need to prepend the model abbreviation to the name since some names are the same
                # between models. For example, its common in Pulsar to have a Library named the
                # same as the Biosample it belongs to.
                alias_name = models.Model.PULSAR_LIMS_PREFIX + pulsar_rec.MODEL_ABBR + "-" + name
                if alias_name not in aliases:
                    aliases.append(alias_name)
        except KeyError:
            pass
        payload["aliases"] = aliases
    
        # `dict`. The POST response if the record didn't yet exist on the ENCODE Portal, or the
        # record JSON itself if it does already exist. Note that the dict. will be empty if the connection
        # object to the ENCODE Portal has the dry-run feature turned on.
        response_json = self.ENC_CONN.post(payload)
        upstream = euu.get_record_id(response_json)
        # Set value of the Pulsar record's upstream_identifier
        print("Setting the Pulsar record's upstream_identifier attribute to '{}'.".format(upstream))
        pulsar_rec.patch(payload={"upstream_identifier": upstream})
        print("upstream_identifier attribute set successfully.")
        return upstream

[docs]    def get_biosample_term_name_and_type(self, biosample):
        """
        Creates a dict. with the keys:
        
          biosample_term_name
          biosample_term_id
          biosample_type 

        Args:
            biosample: `pulsarpy.models.Biosample` instance.

        Returns:
            `dict`.
        """
        res = {} 
        btn = models.BiosampleTermName(biosample.biosample_term_name_id)
        res["biosample_term_name"] = btn.name
        res["biosample_term_id"] = btn.accession
        bty = models.BiosampleType(biosample.biosample_type_id)
        res["biosample_type"] = bty.name
        return res

    def get_exp_of_biosample(self, dcc_biosample_id):
        """
        Searches the Portal for the experiment that is linked to the given biosample.

        Args:
            dcc_biosample_id: `str`. A biosample record identifier on the Portal; used as the
                search term.

        Returns:
            `str`: The accession of the single experiment found.

        Raises:
            `IndexError`: No experiment was found.
            `Exception`: More than one experiment was found.
        """
        query_string = "?searchTerm={}&type=Experiment".format(dcc_biosample_id)
        experiments = self.ENC_CONN.search(url=query_string)
        if not experiments:
            # Same exception type as the bare experiments[0] lookup used to raise, but with a
            # message that says what actually went wrong.
            raise IndexError("Expected to find 1 experiment linked to biosample {}, instead found none.".format(dcc_biosample_id))
        if len(experiments) > 1:
            raise Exception("Expected to find 1 experiment linked to biosample {}, instead found more: {}.".format(dcc_biosample_id, len(experiments)))
        return experiments[0]["accession"]

[docs]    def post_library_through_fastq(self, pulsar_library_id, dcc_exp_id, patch=False):
        """
        POSTS the Biosample, it's latest Library, and all SequencingResults for that Library.

        Args:
            pulsar_library_id: `int`. The ID of a Pulsar Library record.
            dcc_exp_id: `int`. The ID of the experiment record on the Portal to link the replicate to.
        """
        # POST biosample record
        pulsar_library = models.Library(pulsar_library_id)
        pulsar_biosample_id = pulsar_library.biosample_id
        biosample_upstream = self.post_biosample(pulsar_biosample_id, patch=patch)
        biosample = models.Biosample(pulsar_biosample_id)
        #if not dcc_exp_id:
        #    dcc_exp_id = self.get_exp_of_biosample(biosample_upstream)
            
        # POST library record
        library_upstream = self.post_library(rec_id=pulsar_library.id, patch=patch)
        # POST replicate record
        replicate_upstream = self.post_replicate(pulsar_library_id=pulsar_library.id, dcc_exp_id=dcc_exp_id, patch=patch)
        # POST file records for all sequencing results for the Library
        sres_ids = pulsar_library.sequencing_result_ids
        if not sres_ids:
            msg = "No SequencingResult for Library {} of Biosample {}, exiting.".format(pulsar_biosample_id, pulsar_library.id)
            error_logger.error(msg)
            raise MissingSequencingResult(msg)
        
        for i in sres_ids:
            self.post_sres(pulsar_sres_id=i, enc_replicate_id=replicate_upstream, patch=patch)

[docs]    def post_sres(self, pulsar_sres_id, enc_replicate_id, patch=False):
        """
        A wrapper over ``self.post_fastq_file()``.  Whereas ``self.post_fastq_file()`` only 
        uploads the FASTQ file for the given read number, this method calls ``self.post_fastq_file()``
        twice potentially, once for each FASTQ file in the Pulsar SequencingResult. Thus,
        if paired-end sequencing was done, ``self.post_fastq_file()`` will be called twice to upload
        the forward and reverse reads FASTQ files. 
        """
        sres = models.SequencingResult(pulsar_sres_id)
        srun = models.SequencingRun(sres.sequencing_run_id)
        sreq = models.SequencingRequest(srun.sequencing_request_id)
        self.post_fastq_file(pulsar_sres_id=sres.id, read_num=1, enc_replicate_id=enc_replicate_id, patch=patch)
        if not sreq.paired_end and sres.read2_uri:
            sres.patch({"paired_end": True})
        if sreq.paired_end:
            # Submit read2
            self.post_fastq_file(pulsar_sres_id=sres.id, read_num=2, enc_replicate_id=enc_replicate_id, patch=patch)

[docs]    def check_if_biosample_has_exp_on_portal(self, dcc_biosample_id):
        """
        Given a Portal biosample record ID, searches the Portal for associated experiment records.
        Any that are found are returned in a list. 

        Args:
            dcc_biosample_id: `str`. A biosample record identifier on the Portal.

        Returns:
            `list` of associated experiment records, where each is JSON-serialized. 

        Raises:
            `Exception`: The biosample is linked to more than one experiment.
        """
        if not dcc_biosample_id:
            return False
        exps = self.ENC_CONN.get_experiments_with_biosample(rec_id=dcc_biosample_id)
        if exps:
            # Should only exist on one experiment. exps is an array of >= 0 experiment records.
            if len(exps) > 1:
                accessions = []
                for i in exps:
                    accessions.append(i["accession"])
                msg = "Error: Biosample {} is associated to more than one Portal experiment: {}.".format(dcc_biosample_id, ", ".join(accessions))
                raise Exception(msg)
            return exps[0]
        return False
        
[docs]    def post_chipseq_ctl_exp(self, rec_id, wt_input=False, paired_input=False, exp_only=False, patch=False):
        """
        Creates a control experiment record on the ENCODE Portal for either the paired-input control
        biosample(s) or the wild-type input biosample on the Pulsar ChipseqExperiment.

        Args:
            rec_id: `int`. ID of a ChipseqExperiment record in Pulsar.
            wt_input: `bool`. True means to make a control experiment on the Portal for the wild-type
                input biosample on the Pulsar ChipseqExperiment. Note that either this or the
                `paired_input` parameter must be set to True and not both. 
            paired_input: `bool`. True means to make a control experiment on the Portal for the 
                paired-input control biosample(s) on the Pulsar ChipseqExperiment. Note that either 
                this or the `wild_type` parameter must be set to True and not both. 
            exp_only: `bool`. Only makes sense to use when the `patch` parameter is set to True.
                When `exp_only=True`, then don't PATCH Biosample records and everything downstream
                to the file records on the Portal (don't call `self.post_library_through_fastq()`).

        Returns:
            `str`: The ENCODE Portal accession of the control experiment. 

        Raises:
            `ValueError`: Both parameters `wt_input` and `paired_input` are set to False or True.
                Only one of them must be True. 
        """
        print(">>> IN post_chipseq_ctl_exp()")
        if (not wt_input and not paired_input) or (wt_input and paired_input):
            raise ValueError("Either the wt_input or the paired_input parameter must be set to True.")

        pulsar_exp = models.ChipseqExperiment(rec_id)
        input_ids = []
        if wt_input:
            experiment_type = "wild type"
            # Only 1 Wild Type input per experiment.
            if pulsar_exp.wild_type_control_id:
                input_ids.append(pulsar_exp.wild_type_control_id)
        else:
            experiment_type = "paired-input"
            # Normally there will only be one paired_input control Biosample, but there could at times
            # be another. That happens when one of the reps fail, and another rep has to be made from a
            # different cell batch than the sibling rep on the experiment.
            input_ids.extend(pulsar_exp.control_replicate_ids) # Biosample records.
        if not input_ids:
            msg = "Can't submit {} control exp. for {}: no replicates.".format(experiment_type, pulsar_exp.abbrev_id())
            log_error(msg)
            raise ExpMissingReplicates(msg)
        inputs = [models.Biosample(x) for x in input_ids]
        dcc_exp = ""
        for i in inputs:
            dcc_exp = self.check_if_biosample_has_exp_on_portal(i.upstream_identifier)
            if dcc_exp:
                break
        payload = {}
        alias = ""
        for i in inputs:
            alias += i.abbrev_id()
        alias.rstrip()
        if wt_input:
            alias_prefix = "pWT-CTL_"
        else:
            alias_prefix = "pPI-CTL_"
        alias = alias_prefix + alias 
        payload["aliases"] = [alias]
        payload.update(self.get_exp_core_payload_props(pulsar_exp_rec=pulsar_exp, assay_term_name="ChIP-seq"))
        payload["description"] = "ChIP-seq on human " + payload["biosample_term_name"]
        payload["target"] = "Control-human"
  
        # Before POSTING experiment, check if it already exists on the Portal.
        # POST experiment. Don't use self.post() since there isn't a Pulsar model for a control experiment.
        # So, use encode-utils directly to POST.
        if patch:
            if not dcc_exp:
                msg = "Can't PATCH " + alias + " control experiment since it wasn't found on the Portal."
                raise Exception(msg)
            ctl_exp_accession = self.patch(payload=payload, upstream_id=dcc_exp["accession"])
        else:
            # post
            if not dcc_exp:
                payload[self.ENC_CONN.PROFILE_KEY] = "experiment"
                dcc_exp = self.ENC_CONN.post(payload=payload)
            ctl_exp_accession = dcc_exp["accession"]

        if (patch and not exp_only) or not patch:
            for b in input_ids:
                self.post_library_through_fastq(pulsar_biosample_id=b, dcc_exp_id=ctl_exp_accession, patch=patch)
        return ctl_exp_accession

[docs]    def post_bulk_atacseq_exp(self, rec_id, patch=False, patch_all=False):
        """
        Args:
            rec_id: `int`. ID of an AtacSeq experiment record in Pulsar. Should be a bulk and not
                a single-cell experiment. 
            patch: `bool`. True means to patch the DCC experiment record.
            patch_all: `bool`. True means to patch not just the experiment record, but its sub-entities
                also, i.e. biosamples, libraries, replicates, ... Setting this to True automatically
                sets `patch` to True as well.

        Returns:
        """
        if patch_all:
            patch = True
        pulsar_exp = models.Atacseq(rec_id)
        pulsar_exp_upstream = pulsar_exp.upstream_identifier
        payload = {}
        payload.update(self.get_exp_core_payload_props(pulsar_exp_rec=pulsar_exp, assay_term_name="ATAC-seq"))
        desc = pulsar_exp.description.strip()
        if desc:
            payload["description"] = desc
        # submit experiment
        if patch:
            dcc_exp_accession = self.patch(payload=payload, upstream_id=pulsar_exp_upstream)
        if patch_all or not patch:
            dcc_exp_accession = self.post(payload=payload, dcc_profile="experiment", pulsar_model=models.Atacseq, pulsar_rec_id=rec_id)
            self.post_experimental_reps(rec_id=rec_id, experiment_type="atac-seq", patch=patch)

        return dcc_exp_accession

[docs]    def post_chipseq_exp(self, rec_id, patch=False):
        """
        Args:
            rec_id: `int`. ID of a ChipseqExperiment record in Pulsar.

        Returns:
            `str`: The ENCODE Portal accession of the control experiment. 

        Raises:
            `ValueError`: Both parameters `wt_input` and `paired_input` are set to False or True.
                Only one of them must be True. 
        """
        pulsar_exp = models.ChipseqExperiment(rec_id)
        pulsar_exp_upstream = pulsar_exp.upstream_identifier
        payload = {}
        payload.update(self.get_exp_core_payload_props(pulsar_exp_rec=pulsar_exp, assay_term_name="ChIP-seq"))
        target = models.Target(pulsar_exp.target_id)
        target_upstream = target.upstream_identifier
        if not target_upstream:
            msg = "Target {} missing upstream identifier.".format(target.abbrev_id())
            log_error(msg)
            raise MissingTargetUpstream(msg)
        payload["target"] = target.upstream_identifier
        #payload["description"] = pulsar_exp.description.strip()
        payload["description"] = target.upstream_identifier.rstrip('-human') + ' ChIP-seq on human ' + payload["biosample_term_name"]
        # submit experiment
        if patch:
            dcc_exp_accession = self.patch(payload=payload, upstream_id=pulsar_exp_upstream)
        else:
            dcc_exp_accession = self.post(payload=payload, dcc_profile="experiment", pulsar_model=models.ChipseqExperiment, pulsar_rec_id=rec_id)
            # Then POST WT-input and paired-input control experiments. The WT-input is shared across
            # multiple experiments from the same starting batch, so it's possible that it was POSTED
            # already during submission of a related experiment. 
            self.post_chipseq_control_experiments(rec_id=rec_id)
            # POST experimental biosampes
            self.post_experimental_reps(rec_id=rec_id, experiment_type="chip-seq")

        # Add-in/PATCH possible_controls property
        self.patch_chipseq_possible_controls(pulsar_exp.id)
        return dcc_exp_accession

    def patch_chipseq_possible_controls(self, pulsar_exp_id):
        """
        PATCHes the possible_controls property of the Portal experiment that corresponds to the
        given Pulsar ChipseqExperiment. The existing value is overwritten rather than extended.

        Args:
            pulsar_exp_id: `int`. ID of a ChipseqExperiment record in Pulsar.
        """
        controls_payload = {"possible_controls": self.get_chipseq_possible_controls(pulsar_exp_id)}
        experiment = models.ChipseqExperiment(pulsar_exp_id)
        self.patch(
            payload=controls_payload,
            upstream_id=experiment.upstream_identifier,
            dont_extend_arrays=True,
        )
        
    def get_chipseq_possible_controls(self, pulsar_exp_id):
        """
        Collects the Portal accessions of the control experiments (WT input and paired inputs)
        of the given Pulsar ChipseqExperiment.

        Args:
            pulsar_exp_id: `int`. ID of a ChipseqExperiment record in Pulsar.

        Returns:
            `list` of unique control experiment accessions, in no particular order.

        Raises:
            `Exception`: One of the control biosamples doesn't have a control experiment on the
                Portal.
        """
        experiment = models.ChipseqExperiment(pulsar_exp_id)

        # Wild-type input control.
        wt_biosample = models.Biosample(experiment.wild_type_control_id)
        wt_ctl_exp = self.check_if_biosample_has_exp_on_portal(wt_biosample.upstream_identifier)
        if not wt_ctl_exp:
            raise Exception("WT input {} on ChipseqExperiment {} doesn't have an upstream control experiment record.".format(wt_biosample.abbrev_id(), experiment.abbrev_id()))
        accessions = [wt_ctl_exp["accession"]]

        # Paired-input controls.
        paired_inputs = [models.Biosample(biosample_id) for biosample_id in experiment.control_replicate_ids]
        for paired_input in paired_inputs:
            pi_ctl_exp = self.check_if_biosample_has_exp_on_portal(paired_input.upstream_identifier)
            if not pi_ctl_exp:
                raise Exception("Paired input {} on ChipseqExperiment {} doesn't have an upstream control experiment record.".format(paired_input.abbrev_id(), experiment.abbrev_id()))
            accessions.append(pi_ctl_exp["accession"])
        # Several control biosamples may share the same control experiment.
        return list(set(accessions))
        
[docs]    def post_chipseq_control_experiments(self, rec_id):
        """
        POSTS the WT input and the paired input controls that are associated to the indicated 
        ChipseqExperiment in Pulsar, turning each into an experiment record on the Portal.

        Args:
            rec_id: `int`. ID of a ChipseqExperiment record in Pulsar.
        """
        print(">>> IN post_chipseq_control_experiments()")
        # First the WT-input:
        self.post_chipseq_ctl_exp(rec_id=rec_id, wt_input=True)
        # Then the Paired-input, which is unique to this experiment. 
        self.post_chipseq_ctl_exp(rec_id=rec_id, paired_input=True)

[docs]    def post_experimental_reps(self, rec_id, experiment_type, patch=False):
        """
        POSTS the experimental replicates of a ChipseqExperiment or bulk Atacseq experiment object.

        Args:
            rec_id: `int`. ID of a ChipseqExperiment record in Pulsar.
            experiment_type: `str`. Either of chip-seq or atac-seq.
        """
        if experiment_type == "chip-seq":
            pulsar_exp = models.ChipseqExperiment(rec_id)
        elif experiment_type == "atac-seq":
            pulsar_exp = models.Atacseq(rec_id)
        else:
            raise Exception("Unknown experiment type '{}' passed to experiment_type parameter.".format(experiment_type))
        rep_ids = pulsar_exp.replicate_ids
        for i in rep_ids:
            self.post_library_through_fastq(pulsar_library_id=i, dcc_exp_id=pulsar_exp.upstream_identifier, patch=patch)

[docs]    def get_exp_core_payload_props(self, pulsar_exp_rec, assay_term_name):
        """
        Args:
            pulsar_exp_rec: `str`. `pulsarpy.models` subclass being either ChipSeq or Atacseq.
            assay_term_name: `str`. Either 'ChIP-seq' or ATAC-seq.
        """
        payload = {}
        first_rep_library = models.Library(pulsar_exp_rec.replicate_ids[0])
        first_rep_biosample = models.Biosample(first_rep_library.biosample_id)
        # Add biosample_term_name, biosample_term_id, and biosample_type props
        btn = models.BiosampleTermName(first_rep_biosample.biosample_term_name_id)
        bty = models.BiosampleType(first_rep_biosample.biosample_type_id)
        payload["biosample_ontology"] = self.ENC_CONN.get_biosample_type(classification=bty.name, term_id=btn.accession)["@id"]
        payload["assay_term_name"] = assay_term_name
        payload["documents"] = self.post_documents(pulsar_exp_rec.document_ids)
        payload["experiment_classification"] = ["functional genomics assay"]
        submitter_comment = pulsar_exp_rec.submitter_comments.strip()
        if submitter_comment:
            payload["submitter_comment"] = submitter_comment
        return payload
    
    def post_crispr_modification(self, rec_id, patch=False):
        """
        Submits a Pulsar CrisprModification to the ENCODE Portal as a genetic_modification record.

        Args:
            rec_id: `int`. ID of a CrisprModification record in Pulsar.
            patch: `bool`. True means to PATCH the existing record on the Portal instead of
                POSTing it.

        Returns:
            `str`: The identifier of the genetic_modification record on the ENCODE Portal.

        Raises:
            `MissingTargetUpstream`: The Target of the modification's DonorConstruct doesn't have
                the upstream_identifier attribute set in Pulsar.
        """
        rec = models.CrisprModification(rec_id)
        # CrisprConstruct(s)
        ccs = [models.CrisprConstruct(i) for i in rec.crispr_construct_ids]
        # DonorConstruct
        dc = models.DonorConstruct(rec.donor_construct_id)
        dc_target = models.Target(dc.target_id)
        target_upstream = dc_target.upstream_identifier
        if not target_upstream:
            msg = "Target {} missing upstream identifier.".format(dc_target.abbrev_id())
            log_error(msg)
            raise MissingTargetUpstream(msg)

        payload = {}
        payload["category"] = rec.category # Required
        # The description may not be set at all in Pulsar.
        desc = (rec.description or "").strip()
        if desc:
            payload["description"] = desc
        payload["documents"] = self.post_documents(rec.document_ids)
        payload["guide_rna_sequences"] = [c.guide_sequence for c in ccs]

        # The introduced_sequence property is deliberately not submitted for insertions and
        # replacements: the insert can be viewed in addgene, and it doesn't look good to show
        # on the Portal.

        payload["method"] = "CRISPR"       # Required
        payload["modified_site_by_target_id"] = target_upstream
        payload["purpose"] = rec.purpose   # Required

        # Note that CrisprConstruct can also has_many construct_tags. Those are not part of the donor
        # insert though.
        construct_tags = [models.ConstructTag(i) for i in dc.construct_tag_ids]
        seen_tags = set()
        introduced_tags = []
        for tag in (x.name for x in construct_tags):
            if tag.startswith("eGFP"):
                # Pulsar has two eGFP tags that differ by the linker sequence:
                #    1) eGFP (MH170480)
                #    2) eGFP (MH170481)
                # The Portal, however, only has eGFP and it makes most sense to submit this as
                # simply eGFP and mention the linker used elsewhere.
                tag = "eGFP"
            if tag not in seen_tags:
                # Avoid potential for duplicate tags, which are not allowed of course on Portal.
                seen_tags.add(tag)
                introduced_tags.append({"name": tag, "location": "C-terminal"})
        if not introduced_tags:
            # tags are required for modifications on the Portal.
            introduced_tags = [{"name": "eGFP", "location": "C-terminal"}]
        payload["introduced_tags"] = introduced_tags

        reagents = []
        for construct in ccs + [dc]:
            addgene_id = construct.addgene_id
            if addgene_id:
                reagents.append({
                    "source": "addgene",
                    # str() so that a numeric ID doesn't break the concatenation.
                    "url": "http://www.addgene.org/" + str(addgene_id),
                    "identifier": addgene_id,
                })
        if reagents:
            payload["reagents"] = reagents
        # ex: ENCGM094ZOS

        if patch:
            upstream_id = self.patch(payload=payload, upstream_id=rec.upstream_identifier)
        else:
            upstream_id = self.post(payload=payload, dcc_profile="genetic_modification", pulsar_model=models.CrisprModification, pulsar_rec_id=rec_id)
        return upstream_id
    
    def post_document(self, rec_id, patch=False):
        """
        Submits a Pulsar Document to the ENCODE Portal as a document record. The file contents
        are downloaded and embedded in the payload as a base64-encoded data URI.

        Args:
            rec_id: `int`. ID of a Document record in Pulsar.
            patch: `bool`. True means to PATCH the existing record on the Portal instead of
                POSTing it.

        Returns:
            `str`: The identifier of the document record on the ENCODE Portal.
        """
        rec = models.Document(rec_id)
        payload = {}
        # The description may not be set at all in Pulsar. Submit the stripped value, which is
        # also the value that is tested for emptiness.
        desc = (rec.description or "").strip()
        if desc:
            payload["description"] = desc
        doc_type = models.DocumentType(rec.document_type_id)
        payload["document_type"] = doc_type.name
        content_type = rec.content_type
        # Create attachment for the attachment prop
        file_contents = rec.download()
        encoded = base64.b64encode(file_contents).decode("utf-8")
        href = "data:{mime_type};base64,{temp_uri}".format(mime_type=content_type, temp_uri=encoded)
        payload["attachment"] = {
            "download": rec.name,
            "type": content_type,
            "href": href,
        }
        if patch:
            upstream_id = self.patch(payload, rec.upstream_identifier)
        else:
            upstream_id = self.post(payload=payload, dcc_profile="document", pulsar_model=models.Document, pulsar_rec_id=rec_id)
        return upstream_id

    def post_documents(self, rec_ids, patch=False):
        """
        Submits each of the given Pulsar Document records via `post_document()`.

        Args:
            rec_ids: Iterable of Document record IDs in Pulsar.
            patch: `bool`. Passed through to each `post_document()` call.

        Returns:
            `list`: The return value of each `post_document()` call, in the order of `rec_ids`.
        """
        return [self.post_document(rec_id=doc_id, patch=patch) for doc_id in rec_ids]

    def post_treatments(self, rec_ids, patch=False):
        """
        Submits each of the given Pulsar Treatment records via `post_treatment()`.

        Args:
            rec_ids: Iterable of Treatment record IDs in Pulsar.
            patch: `bool`. Passed through to each `post_treatment()` call.

        Returns:
            `list`: The return value of each `post_treatment()` call, in the order of `rec_ids`.
        """
        return [self.post_treatment(rec_id=treatment_id, patch=patch) for treatment_id in rec_ids]

    def post_treatment(self, rec_id, patch=False):
        """
        Submits a Pulsar Treatment record to the ENCODE Portal "treatment" profile. Any Documents
        linked to the Treatment are submitted first via `post_documents()`.

        Args:
            rec_id: The ID of the Treatment record in Pulsar.
            patch: `bool`. True means to PATCH the Portal record identified by the Treatment's
                upstream_identifier rather than POST a new record.

        Returns:
            The value returned by `self.patch()` when `patch` is True, otherwise the value
            returned by `self.post()`.
        """
        rec = models.Treatment(rec_id)
        payload = {}
        # Amount, duration and temperature are each submitted together with their units, and
        # only when the Pulsar value is truthy.
        conc = rec.concentration
        if conc:
            payload["amount"] = conc
            conc_unit = models.Unit(rec.concentration_unit_id)
            payload["amount_units"] = conc_unit.name
        duration = rec.duration
        if duration:
            payload["duration"] = duration
            payload["duration_units"] = rec.duration_units
        # NOTE(review): a temperature of 0 is falsy and is therefore not submitted - confirm
        # whether 0 degrees Celsius is a value that needs to reach the Portal.
        temp = rec.temperature_celsius
        if temp:
            payload["temperature"] = temp
            payload["temperature_units"] = "Celsius"
        ttn = models.TreatmentTermName(rec.treatment_term_name_id)
        payload["treatment_term_id"] = ttn["accession"]
        payload["treatment_term_name"] = ttn["name"]
        payload["treatment_type"] = rec.treatment_type
        payload["documents"] = self.post_documents(rec.document_ids)
        # Submit
        if patch:
            # Previously referenced the misspelled name 'pyaload', which raised a NameError on
            # every PATCH.
            upstream_id = self.patch(payload, rec.upstream_identifier)
        else:
            upstream_id = self.post(payload=payload, dcc_profile="treatment", pulsar_model=models.Treatment, pulsar_rec_id=rec_id)
        return upstream_id

    
[docs]    def post_vendor(self, rec_id, patch=False):
        """
        Vendors must be registered directly by the DCC personel. 
        """
        raise Exception("Vendors must be registered directly by the DCC personel.")

[docs]    def get_gel_lane_with_biosample(self, immunoblot_id, biosample_id):
        """
        Given an Immunoblot record ID, and a Biosample record ID, returns the GelLane object with the 
        given Biosample. This method assumes that a Gel won't have more than one GelLane with the same
        Biosample. 

        Note that there should only be 1 Gel, even though the Rails Immunoblot model allows many -
        on the 'to fix list'. 

        Args:
            immunoblot_id: `int`. Immunoblot record ID.
            biosample_id: `int`. Biosample record ID.

        Returns:
           `None` if the GelLane didn't pass. Otherwise, a `pulsarpy.models.GelLane` instance.  
       
        Raises:
            `IpLaneException`: One of multiple issues that could be present as indicated by the error
              message, i.e. 

                * The Biosample doesn't have an associated Gel
                * There isn't a GelLane with the Biosample on it. 
        """
        ip = models.Immunoblot(immunoblot_id)
        if not ip.gel_ids:
            raise IpLaneException("IP {} for Biosample {} does not have a Gel.".format(ip.id, biosample_id))
        gel = models.Gel(ip.gel_ids[0])
        gl = "" # GelLane
        for gel_lane_id in gel.gel_lane_ids:
            gel_lane = models.GelLane(gel_lane_id)
            if biosample_id == gel_lane.biosample_id:
                gl = gel_lane
        if not gl:
            raise IpLaneException("Could't find a GelLane that has Biosample {} on Immunoblot {}.".format(biosample_id, immunoblot_id))
        if not gl.attrs["pass"]:
            print("GelLane didn't pass for Biosample {}.".format(biosample_id))
            return None
        return gl

[docs]    def post_ip_biosample_characterization(self, immunoblot_id, biosample_id, patch=False):
        """
        Submits a Pulsar Immunoblot for a specific lane (biosample) on a Gel to the ENCODE biosample_characterization
        profile. Such an immunoblot is used to show whether the eGFP-tagged target (using CRISPR) 
        is expressed (has a band in the size range of the expected taget size).
        Only submit these after the ChipSeq experiment (and hence CrisprModification) has been submitted. Even though some Biosamples
        have a successful IP, they don't all need to be submitted. For example, in one case a Biosample
        was lost after a successful IP and hence couldn't do the crosslinking for ChIP later on.
        Another reason may be that we already have enough validated Biosamples to submit.

        This method makes the assumption that a given gel won't have more than one lane with the same
        Biosample.

        Returns:
            `None`: The Biosmaple isn't already registered on the Portal.
            `None`: The Biosample has an IP, but not one that passes (based on the GelLane.pass attribute)
            `None`: The Non-WT Biosample isn't yet registerd on the Portal
            `None`: The non-WT biosample that doesn't have a ChipSeq object.
            `int`: The ID of the created biosample_characterization record on the Portal. 
        """
        GEL_IMAGE_DIR = os.path.join(os.path.curdir, "gel_images")
        if not os.path.exists(GEL_IMAGE_DIR):
            os.mkdir(GEL_IMAGE_DIR)

        biosample = models.Biosample(biosample_id)
        if not biosample.upstream_identifier:
            # For now, don't sumbit until jadrian says otherwise. 
            print("Biosample missing upstream - skipping.")
            return None
            
        ip = models.Immunoblot(immunoblot_id)
        gl = self.get_gel_lane_with_biosample(immunoblot_id=immunoblot_id, biosample_id=biosample_id)
        if not gl:
            return None

        payload = {}
        if not biosample.wild_type:
            # Find WT parent that has an associated Immunoblot to use as control
            # and set that Biosample's upstream_identifier as the value of the ENCODE property 
            # biosample_characterization.wildtype_biosample. Currently, the WT biosample is determined
            # soley by biosample_term_name. 
            biosample_term_name = models.BiosampleTermName(biosample.biosample_term_name_id).name
            if biosample_term_name == "A549":
                wt_biosample_id = 2551
            elif biosample_term_name == "GM23338":
                wt_biosample_id = 2559
            elif biosample_term_name == "HepG2":
                wt_biosample_id = 2510
            elif biosample_term_name == "MCF-7":
                wt_biosample_id = 2515
            elif biosample_term_name == "SK-N-SH":
                wt_biosample_id = 11200
            else:
                msg = "Can't submit IP biosample_characterization for Biosample {} IP {} since the wild type biosample with its own Immunoblot can't be determined for biosample term name {}.".format(biosample.id, immunoblot_id, biosample_term_name)
                error_logger.error(msg)
                return None
            wt_biosample = models.Biosample(wt_biosample_id)
            wt_biosample_upstream = wt_biosample.upstream_identifier
            if not wt_biosample_upstream:
                print("POSTING WT parent Biosample.")
                wt_biosample_upstream = self.post_biosample(wt_biosample_id)
            # Then POST the Immunoblot linked to the WT Parent Biosample. Note that it's possible
            # but unlikely for a Biosample to be linked to multiple Immunoblots. In that case, the
            # first one will be submitted. 
            wt_ip_id = wt_biosample.immunoblot_ids[0]
            print("POSTING WT parent Biosample's Immunoblot.")            
            self.post_ip_biosample_characterization(immunoblot_id=wt_ip_id, biosample_id=wt_biosample_id, patch=False)
            payload["wildtype_biosample"] = wt_biosample_upstream
        
        payload["characterization_method"] = "immunoblot"
        payload["characterizes"] = biosample.upstream_identifier
        payload["documents"] = self.post_documents(ip.document_ids)
        payload["review"] = {"lab": "richard-myers", "lane": gl.lane_number}
        # Process attachment property for gel image.
        # A Pulsar Gel object can have many GelImages (different exposure times), but Jess has indicated
        # to take the first one if multiple are present. 
        gel = models.Gel(gl.gel_id)
        if not gel.gel_image_ids:
            msg = "GelLane {} of Gel {} for Biosample {} is missing a GelImage.".format(gl.id, gel.id, biosample_id)
            error_logger.error(msg)
            raise IpLaneException(msg)
        gel_image = models.GelImage(sorted(gel.gel_image_ids)[0])
        # The image URI is expected to have public read permission.
        # Some paths store a // at the beginning to tell the browser to use the same protocol as it's currently
        # using (HTTP/HTTPS). In that case, just prefix it with 'https:'.
        image_uri = gel_image.image
        if image_uri.startswith("//"):
            image_uri = "https:" + image_uri
        image_basename = os.path.basename(image_uri)
        image_exists_locally = os.path.join(GEL_IMAGE_DIR, image_basename)
        if not os.path.exists(image_exists_locally):
            # Then download it
            stream = requests.get(image_uri, stream=True)
            fout = open(image_exists_locally, 'wb')
            for line in stream.iter_content(chunk_size=512):
                fout.write(line)
            fout.close()
        payload["attachment"] = {"path": image_exists_locally}
        # Caption
        btn = models.BiosampleTermName(biosample.biosample_term_name_id).name
        caption = "Immunoprecipitation was performed on nuclear extracts from biosample {}".format(biosample.upstream_identifier)
        if biosample.wild_type:
            caption += " ({} wild type)".format(btn)
        else:
            if not biosample.chipseq_experiment_ids:
                msg = "Biosample {} is not linked to any ChipSeq experiments.".format(biosample_id)
                error_logger.error(msg)
                return None
                #raise IpLaneException(msg)
            elif len(biosample.chipseq_experiment_ids) > 1:
                msg = "Biosample {} is linked to more than 1 ChipSeq experiment. It is not known as to which one this IP relates.".format(biosample_id)
                error_logger.error(msg)
                raise IpLaneException(msg)
            chipseq_exp = models.ChipseqExperiment(biosample.chipseq_experiment_ids[0])
            if not chipseq_exp.upstream_identifier:
                msg = "ChipSeq experiment {} for Biosample {} needs to be submitted prior to submitting the IP biosample_characterization.".format(chipseq_exp.id, biosample_id)
                error_logger.error(msg)
                raise IpLaneException(msg)
            crispr_modification = models.CrisprModification(biosample.crispr_modification_id)
            if not crispr_modification.upstream_identifier:
                msg = "Biosample {} has a CrisprModification, but it isn't yet registered on the Portal.".format(biosample_id)
                error_logger.error(msg)
                raise IpLaneException(msg)
            crispr_construct = models.CrisprConstruct(crispr_modification.crispr_construct_ids[0])
            target = models.Target(crispr_construct.target_id)
            # Get biosample_replicate_number on experiment in Portal
            rep_hash = encode_utils.replicate.ExpReplicates(self.ENC_CONN, chipseq_exp.upstream_identifier).rep_hash
            brn = rep_hash[biosample.upstream_identifier]["brn"]
            caption += " ({} eGFP-{} replicate {})".format(btn, target.name, brn)
        caption += " cells using anti-eGFP antibody. The image shows western blot analysis of input"
        caption += " (lane 1),"
        if not biosample.wild_type:
            caption += " immunoprecipitate (lane 2), and mock immunoprecipitate using IgG (lane 3)."
        else:
            caption += " and immunoprecipitate (lane 2)."
        caption += " Molecular weight standard (Bio-Rad, cat. # 161-0374) contains 10"
        caption += " pre-stained recombinant proteins (250, 150, 100, 75, 50, 37, 25, 20, 15, and 10 kD)."
        if not biosample.wild_type:
            caption += " The target molecular weight is {} kD as indicated with an arrow.".format(gl.expected_product_size)
        if gl.low_target_band_intensity:
             caption += " Lower size bands which may be due to potential degradation products are marked with asterisks."
       
        payload["caption"] = caption

        submitter_comment = ip.submitter_comments
        if submitter_comment:
            payload["submitter_comment"] = submitter_comment

        # Submit payload
        if patch:  
            upstream_id = self.patch(payload, gl.upstream_identifier)
        else:
            upstream_id = self.post(payload=payload, dcc_profile="biosample_characterization", pulsar_model=models.GelLane, pulsar_rec_id=gl.id)
        return upstream_id

    def post_biosample(self, rec_id, patch=False):
        """
        Submits a Pulsar Biosample record to the ENCODE Portal "biosample" profile.

        Linked records are submitted along the way: the CrisprModification (if any), Documents,
        Treatments, the part_of Biosample when its upstream is missing or doesn't start with
        "ENCBS", and any pooled_from Biosamples that lack an upstream. The Donor is never
        submitted here.

        The alias lab prefixes will be set in the encode_utils package if the DCC_LAB environment
        variable is set.

        Args:
            rec_id: The ID of the Biosample record in Pulsar.
            patch: `bool`. True means to PATCH the Portal record identified by the Biosample's
                upstream_identifier rather than POST a new record.

        Returns:
            The value returned by `self.patch()` when `patch` is True, otherwise the value
            returned by `self.post()`.

        Raises:
            `Exception`: The Biosample's Donor doesn't have its upstream set.
        """
        biosample = models.Biosample(rec_id)
        payload = {}

        def add_if_set(prop, value):
            # Optional properties are only submitted when the Pulsar value is truthy.
            if value:
                payload[prop] = value

        # biosample_ontology is resolved on the Portal from the biosample type and term accession.
        term_name = models.BiosampleTermName(biosample.biosample_term_name_id)
        sample_type = models.BiosampleType(biosample.biosample_type_id)
        payload["biosample_ontology"] = self.ENC_CONN.get_biosample_type(classification=sample_type.name, term_id=term_name.accession)["@id"]

        # The date goes into a different property for tissues than for everything else.
        taken_on = biosample.date_biosample_taken
        if taken_on:
            date_prop = "date_obtained" if sample_type.name == "tissue" else "culture_harvest_date"
            payload[date_prop] = taken_on

        add_if_set("description", biosample.description.strip())

        donor = models.Donor(biosample.donor_id)
        donor_upstream = donor.get_upstream()
        if not donor_upstream:
            raise Exception("Donor '{}' of biosample '{}' does not have its upstream set. Donors must be registered with the DCC directly.".format(donor.id, rec_id))
        payload["donor"] = donor_upstream

        add_if_set("lot_id", biosample.lot_identifier)
        add_if_set("nih_institutional_certification", biosample.nih_institutional_certification)

        payload["organism"] = "human"

        add_if_set("passage_number", biosample.passage_number)

        amount = biosample.starting_amount
        if amount:
            payload["starting_amount"] = amount
            payload["starting_amount_units"] = models.Unit(biosample.starting_amount_units_id).name

        add_if_set("submitter_comment", biosample.submitter_comments)
        add_if_set("preservation_method", biosample.tissue_preservation_method)
        add_if_set("product_id", biosample.vendor_product_identifier)

        modification_id = biosample.crispr_modification_id
        if modification_id:
            payload["genetic_modifications"] = [self.post_crispr_modification(modification_id)]

        payload["documents"] = self.post_documents(biosample.document_ids)

        parent_id = biosample.part_of_id
        if parent_id:
            parent_upstream = models.Biosample(parent_id).get_upstream()
            # Submit the parent first unless it already has a Portal biosample accession.
            if not (parent_upstream and parent_upstream.startswith("ENCBS")):
                parent_upstream = self.post_biosample(parent_id)
            payload["part_of"] = parent_upstream

        pooled_ids = biosample.pooled_from_biosample_ids
        if pooled_ids:
            pooled_members = [models.Biosample(member_id) for member_id in pooled_ids]
            # Each pooled member is submitted first when it has no upstream yet.
            payload["pooled_from"] = [
                member.get_upstream() or self.post_biosample(member.id)
                for member in pooled_members
            ]

        payload["source"] = (
            self.get_vendor_id_from_encodeportal(biosample.vendor_id)
            if biosample.vendor_id
            else "michael-snyder"
        )

        payload["treatments"] = self.post_treatments(biosample.treatment_ids)

        if patch:
            return self.patch(payload, biosample.upstream_identifier)
        return self.post(payload=payload, dcc_profile="biosample", pulsar_model=models.Biosample, pulsar_rec_id=rec_id)

[docs]    def post_library(self, rec_id, patch=False):
        """
        This method will check whether the biosample associated to this library is submitted. If it
        isn't, it will first submit the biosample. 
        """
        rec = models.Library(rec_id)
        payload = {}
        biosample = models.Biosample(rec.biosample_id)
        # If this Library record is a SingleCellSorting.library_prototype, then the Biosample it will
        # be linked to is the SingleCellSorting.sorting_biosample.
        payload["biosample"] = biosample.upstream_identifier
        payload["documents"] = self.post_documents(rec.document_ids)
        fragmentation_method_id = rec.library_fragmentation_method_id
        if fragmentation_method_id:
            fragmentation_method = models.LibraryFragmentationMethod(fragmentation_method_id)
            payload["fragmentation_methods"] = [fragmentation_method.name]
        payload["lot_id"] = rec.lot_identifier
        payload["nucleic_acid_term_name"] = models.NucleicAcidTerm(rec.nucleic_acid_term_id).name
        payload["product_id"] = rec.vendor_product_identifier
        payload["size_range"] = rec.size_range
        payload["strand_specificity"] = bool(rec.strand_specific)
        if rec.vendor_id:
            payload["source"] = self.get_vendor_id_from_encodeportal(rec.vendor_id)
        else:
            payload["source"] = "michael-snyder"
        ssc_id = rec.single_cell_sorting_id
        if ssc_id:
           barcode_details = self.get_barcode_details_for_ssc(ssc_id=ssc_id)
           payload["barcode_details"] = barcode_details

        # Submit payload
        if patch:  
            upstream_id = self.patch(payload, rec.upstream_identifier)
        else:
            upstream_id = self.post(payload=payload, dcc_profile="library", pulsar_model=models.Library, pulsar_rec_id=rec_id)
        return upstream_id

[docs]    def post_replicate(self, pulsar_library_id, dcc_exp_id, patch=False):
        """
        Submits a replicate record, linked to the specified library and experiment. 
        First, replicates on the experiment will be searched to see if a replicate already exists 
        for a specifc biosample and library combination, and if so then that repicate's JSON from 
        the ENCODE Portal is returned.

        If the associated experiment is ChIP-seq, and isn't a control experpiment, then the 
        replicate will be submitted with a link to antibody ENCAB728YTO (AB-9 in Pulsar), which is 
        the GFP-specific antibody used to pull down GFP-tagged TFs. 

        Args:
            pulsar_library_id: `int`. The ID of a Library record in Pulsar.
            dcc_exp_id: `int`. The ID of the experiment record on the Portal to link the replicate to.

        Returns:
            `str`: The replicate.uuid property value of the record on the ENCODE Portal.
        """
        # Required fields to submit to a replicate are:
        #  -biological_replicate_number
        #  -experiment
        #  -technical_replicate_number

        #dcc_lib = self.ENC_CONN.get(ignore404=False, rec_ids=dcc_library_id)       
        print(">>> In dcc_submit.post_replicate()")
        payload = {}
        lib = models.Library(pulsar_library_id)
        payload["library"] = lib.upstream_identifier
        biosample_id = lib.biosample_id
        biosample = models.Biosample(biosample_id)
        payload["aliases"] = [biosample.upstream_identifier + "-" + lib.upstream_identifier]
        
        payload["experiment"] = dcc_exp_id
        dcc_exp = self.ENC_CONN.get(rec_ids=dcc_exp_id)
        # Check if replicate already exists for this library
        exp_reps_instance = encode_utils.replicate.ExpReplicates(self.ENC_CONN, dcc_exp_id)
        rep_json = exp_reps_instance.get_rep(biosample_accession=biosample.upstream_identifier, library_accession=lib.upstream_identifier)
        brn = ""
        trn = ""
        if rep_json and not patch:
            return rep_json["uuid"]
        elif rep_json:
            brn = rep_json["biological_replicate_number"]
            trn = rep_json["technical_replicate_number"]
        else:
           # Then there isn't a replicate yet for this library, and maybe not even the biosample.
            if not biosample.upstream_identifier in exp_reps_instance.rep_hash:
                brn = exp_reps_instance.suggest_brn()
                trn = 1
            else:
                brn = exp_reps_instance.rep_hash[biosample.upstream_identifier]["brn"]
                trn = exp_reps_instance.suggest_trn(biosample.upstream_identifier)
         
        if dcc_exp["assay_term_name"] == "ChIP-seq":
            # Only add antibody if not replicate on control experiment 
            if not dcc_exp["target"]["uuid"] == "89839f28-ad35-4bb4-a214-ee65d0a97d8d": # Control-human target
                payload["antibody"] = "ENCAB728YTO" #AB-9 in Pulsar
        #payload["aliases"] = 
        # Set biological_replicate_number and technical_replicate_number. For ssATAC-seq experiments,
        # these two attributes don't really make sense, but they are required to submit, so ...
            
        payload["biological_replicate_number"] = brn
        payload["technical_replicate_number"] = trn
        # Submit payload
        if patch:  
            upstream_id = self.patch(payload, rep_json["uuid"])
        else:
            # POST to ENCODE Portal. Don't use post() method defined here that is a wrapper over 
            # `encode_utils.connection.Connection.post`, since the wrapper only works if the record we
            # are POSTING has a corresponding record type on the Portal. Since Pulsar doesn't have a 
            # corresponding replicate model, we can't use the wrapper method. 
            payload[self.ENC_CONN.PROFILE_KEY] = "replicate"
            res_json = self.ENC_CONN.post(payload=payload)
            upstream_id = euu.get_record_id(res_json)
        return upstream_id

    def upload_fastq_files(self, dcc_biosample_id):
        """
        Unfinished: currently only instantiates the Pulsar Biosample for the given Portal
        biosample identifier and implicitly returns `None`. Nothing is uploaded.

        Args:
            dcc_biosample_id: The Biosample's identifier on the ENCODE Portal.
        """
        models.Biosample(upstream_id=dcc_biosample_id)
        
        

[docs]    def post_fastq_file(self, pulsar_sres_id, read_num, enc_replicate_id, patch=False):
        """
        Creates a file record on the ENCODE Portal. Checks the SequencingResult in Pulsar to see 
        where the file is stored. If stored in DNAnexus, the file will be downloaded locally into 
        the directory given by ``pulsarpy_to_encodedcc.FASTQ_FOLDER`` (the download folder will be
        checked first to see if the file was previously downloaded before attempting to download.
        
        After the file object is created on the ENCODE Portal, it's accession will be stored as the
        upstream identifier in the Pulsar SequencingResult record for the given read.  Thus, if a 
        file object was creatd for a R1 FASTQ file, then the `SequencingResult.read1_upstream_identifier`
        attribute is updated. If instead a file object was created for a R2 FASTQ file, then the
        `SequencingResult.read2_upstream_identifier`` attribute is updated. 

        Some rather complex logic is used to determine the control FASTQ files when submitting an 
        experimental replicate's FASTQ file. If the Biosample associated with the SequencingResult
        is part of a ChipseqExperiment, then the control biosamples consist of the paired input(s) and
        the wild type input, which in Pulsar are given the attribute names 
        `ChipseqExperiment.control_replicates` and `ChipseqExperiment.wild_type_control`. A non-control
        file object on the ENCODE Portal needs to have the ``controlled_by`` property set, which
        points to one or more control FASTQ file accessions on the ENCODE Portal. 
        We normally submit them by matching read numbers, so if the file object we are creating is 
        for a R1 FASTQ file, then all the controlled_by accessions are also R1 FASTQ files. The challenge
        is in knowing which SequencingResult set to use for control FASTQ files. Since a Biosample can
        have multiple Libraries, which can have multiple SequencingRequests, which can have multiple
        SequencingRuns, there can be many sets of SequencingResults. However, since in most cases
        there will only be one of each, the approach taken here is to use the SequencingResults of
        the latest SequencingRun of the latest SequencingRequest. Once this simplicity fails to hold,
        an updated approach will need to be taken. 

        If you have alreay created the file record on the Portal and for some reason the FASTQs didn't
        upload, you can try to reupload the FASTQs by calling this method with patch equal to False. 

        Args:
            pulsar_sres_id: A SequencingResult record in Pulsar. 
            read_num: `int`. being either 1 or 2. Use 1 for the forwrard reads FASTQ file, and 2
                for the reverse reads FASTQ file. A SequencingResult in Pulsar stores the location
                of both files (if paired-end sequening).
            end_replicate_id: `str`. The identifier of the DCC replicate record that the file record
                is to be associated with.

        Returns:
            `dict`. The response from the encode-utils POST or PATCH operation.
        """
        sres = models.SequencingResult(pulsar_sres_id)
        lib = models.Library(sres.library_id)
        bio = models.Biosample(lib.biosample_id)
        srun = models.SequencingRun(sres.sequencing_run_id)
        sreq = models.SequencingRequest(srun.sequencing_request_id)
        platform = models.SequencingPlatform(sreq.sequencing_platform_id)
        payload = {}
        payload["aliases"] = []
        payload["read_length"] = 100
        payload["file_format"] = "fastq"
        payload["output_type"] = "reads"
        # The Pulsar SequencingPlatform must already have the upstream_identifier attribute set.
        payload["platform"] = platform.upstream_identifier
        # set flowcell_details
        flowcell_details = {}
        flowcell_details["barcode"] = lib.get_barcode_sequence()
        flowcell_details["lane"] = str(srun.lane)
        payload["flowcell_details"] = [flowcell_details]
        payload["replicate"] = enc_replicate_id
        #if sreq.paired_end:
        #    payload["run_type"] = "paired-ended"
        #else:
        #    payload["run_type"] = "single-ended"
        payload["run_type"] = "paired-ended"
        if read_num == 1:
            payload["paired_end"] = "1"
            file_uri = sres.read1_uri
            upstream_id = sres.read1_upstream_identifier
            read_count = sres.read1_count
        elif read_num == 2:
            payload["paired_end"] = "2"
            file_uri = sres.read2_uri
            upstream_id = sres.read2_upstream_identifier
            read_count = sres.read2_count
            # Need to set paired_with key in the payload. In this case, 
            # it is expected that R1 has been already submitted.
            if not sres.read1_upstream_identifier:
                raise Exception("Can't set paired_with for SequencingResult {} since R1 doesn't have an upstream set.".format(sres.id))
            payload["paired_with"] = sres.read1_upstream_identifier
        if not file_uri:
            raise NoFastqFile("SequencingResult '{}' for R{} does not have a FASTQ file path set.".format(pulsar_sres_id, read_num))
        elif not upstream_id and patch:
            raise Exception("Can't PATCH file object on the Portal when the SequencingResult {} for read {} doesn't have an upstream ID set.".format(pulsar_sres_id, read_num))

        data_storage = models.DataStorage(srun.data_storage_id)
        data_storage_provider = models.DataStorageProvider(data_storage.data_storage_provider_id)
        # Initialize file_path to be empty string.
        file_path = ""
        if data_storage_provider.name == "DNAnexus":
            dx_file = dxpy.DXFile(dxid=file_uri)
            file_path = os.path.join(FASTQ_FOLDER, dx_file.name)
            # Check if file exists and is non-empty in download directory before attempting to download.
            if not patch:
                if not os.path.exists(file_path) or not os.path.getsize(file_path):
                    # Download file.
                    dxpy.download_dxfile(dxid=file_uri, filename=file_path, show_progress=True)
            file_ref = "dnanexus${}".format(file_uri)
            payload["aliases"].append(file_ref)
            payload["aliases"].append(dx_file.name)
        elif data_storage_provider == "AWS S3 Bucket":
            file_path = file_uri
            # md5sum key added to payload by encode-utils.
            payload["aliases"].append(file_uri) # i.e. s3://bucket-name/key
            payload["aliases"].append(os.path.basename(file_uri))

        if not file_path:
            raise Exception("Could not locate FASTQ file for SequencingResult {}; read number {}.".format(pulsar_sres_id, read_num))
        payload["submitted_file_name"] = file_path 

        dcc_rep = self.ENC_CONN.get(rec_ids=enc_replicate_id, ignore404=False)
        dcc_exp = dcc_rep["experiment"]
        payload["dataset"] = dcc_exp["accession"]
        dcc_exp_accession = dcc_exp["accession"]
        dcc_exp_type = dcc_exp["assay_term_name"] #i.e. ChIP-seq
        if dcc_exp_type == "ChIP-seq":
            if not bio.control and not bio.wild_type:
                controlled_by = self.get_chipseq_controlled_by(pulsar_biosample=bio, read_num=read_num, dcc_exp_id=dcc_exp_accession)
                if controlled_by:
                    # Will be empty if this is a already a control SequencingResult file.
                    payload["controlled_by"] = controlled_by
                else:
                    pass
                    #raise Exception("No controlled_by could be found for SequencingResult {} read number {}.".format(pulsar_sres_id, read_num))
                    # Instead of raise an Exception, let it slide. Pulsar users aren't setting the 
                    # boolean fields control and wild_type_control as they should be so it's not reliable. 
        else:
            if dcc_exp_type != "ATAC-seq":
                raise Exception("There isn't yet support to set controlled_by for experiments of type {}.".format(dcc_exp_type))

        # POST to ENCODE Portal. Don't use post() method defined here that is a wrapper over 
        # `encode_utils.connection.Connection.post`, since the wrapper only works if the record we
        # are POSTING has a corresponding record type on the Portal. Since Pulsar doesn't have a 
        # corresponding file model, we can't use the wrapper method. So we'll have to manually 
        # set the upstream identifier in the attribute `SequencingResult.read1_upstream_identifier`
        # or `SequencingResult.read2_upstream_identifier`.
        if patch:
            upstream_id = self.patch(payload=payload, upstream_id=upstream_id)
        else:
            payload[self.ENC_CONN.PROFILE_KEY] = "file"
            res_json = self.ENC_CONN.post(payload=payload)
            upstream_id = res_json["accession"]
            if read_num == 1:
                sres.patch({"read1_upstream_identifier": upstream_id})
            else:
                sres.patch({"read2_upstream_identifier": upstream_id})
        return upstream_id
        
[docs]    def get_chipseq_controlled_by(self, pulsar_biosample, read_num, dcc_exp_id):
        """
        Given a p
        Returns:
            `list`: The upstream identifiers for the control file objects on the ENCODE Portal.
        """
        if pulsar_biosample.control or pulsar_biosample.wild_type:
            return []
        bio_id = pulsar_biosample.id
        controlled_by = []
        chipseq_experiment = pulsarpy.models.ChipseqExperiment(upstream=dcc_exp_id)
        # First add pooled input. Normally one control but could be more. 
        ctl_map = chipseq_experiment.paired_input_control_map() 
        if bio_id in ctl_map:
            for ctl_id in ctl_map[bio_id]:
                ctl = pulsarpy.models.Biosample(ctl_id)
                lib = ctl.get_latest_library()
                controlled_by.extend(self.get_all_seqresult_fastq_file_accessions(lib)[read_num])
        # Next add WT input
        wt_input_id = chipseq_experiment.wild_type_control_id
        if wt_input_id:
            wt_input = pulsarpy.models.Biosample(wt_input_id)
            lib = wt_input.get_latest_library()
            controlled_by.extend(self.get_all_seqresult_fastq_file_accessions(lib)[read_num])
        return controlled_by

    def get_all_seqresult_fastq_file_accessions(self, pulsar_lib):
        """
        Collects the upstream identifiers of the read 1 and read 2 files of a Library.

        Args:
            pulsar_lib: A Pulsar Library record. Its `sequencing_result_ids` attribute is read.

        Returns:
            `dict`: Has the keys 1 and 2 (the read numbers). Each value is a list holding one
            upstream identifier per SequencingResult, in the order of `sequencing_result_ids`.

        Raises:
            `Exception`: A SequencingResult doesn't have the upstream identifier set for read 1
                or for read 2.
        """
        # NOTE(review): read 2 is required for every SequencingResult, so a result without a
        # read 2 identifier always raises here - confirm that this is intended.
        accessions = {1: [], 2: []}
        for sres_id in pulsar_lib.sequencing_result_ids:
            sres = pulsarpy.models.SequencingResult(sres_id)
            for read_num in (1, 2):
                accession = sres.get_upstream_identifier(read_num=read_num)
                if not accession:
                    raise Exception("Upstream identifier not set for SequencingResult {}, read number {}.".format(sres_id, read_num))
                accessions[read_num].append(accession)
        return accessions

[docs]    def get_barcode_details_for_ssc(self, ssc_id):
        """
        This purpose of this method is to provide a value to the library.barcode_details property
        of the Library profile on the ENCODE Portal. That property taks an array of objects whose
        properties are the 'barcode', 'plate_id', and 'plate_location'. 

        Args:
            ssc_id: The Pulsar ID for a SingleCellSorting record.
        """
        ssc = models.SingleCellSorting(ssc_id)
        lib_prototype_id = ssc.library_prototype_id
        lib = models.Library(lib_prototype_id)
        paired_end = lib.paired_end
        plate_ids = ssc.plate_ids
        plates = [models.Plate(p) for p in plate_ids]
        results = []
        for p in plates:
            for well_id in p.well_ids:
                well = models.Well(well_id)
                details = {}
                details["plate_id"] = p.name
                details["plate_location"] = well.name
                well_biosample = models.Biosample(well.biosample_id)
                lib_id = well_biosample.library_ids[-1]
                # Doesn't make sense to have more than one library for single cell experiments. 
                lib = models.Library(lib_id)
                if not paired_end:
                    barcode_id = lib.barcode_id
                    barcode = models.Barcode(barcode_id).sequence
                else:
                    pbc_id = lib.paired_barcode_id
                    pbc = models.PairedBarcode(pbc_id)
                    barcode = pbc.index1["sequence"] + "-" + pbc.index2["sequence"]
                details["barcode"] = barcode
                results.append(details)
        return results

    def post_single_cell_sorting(self, rec_id, patch=False):
        """
        Submits a Pulsar SingleCellSorting record as an experiment, followed by its sorting
        biosample, its library prototype, and a replicate.

        The sequencing requests and sequencing runs of the record are loaded afterwards, but no
        file records are submitted for them here.

        Args:
            rec_id: The Pulsar ID of the SingleCellSorting record.
            patch: `bool`. Passed on to `post_biosample`, `post_library`, and `post_replicate`.
                It isn't passed to the `self.post` call that submits the experiment.
        """
        sorting = models.SingleCellSorting(rec_id)
        sorting_biosample = models.Biosample(sorting.sorting_biosample_id)
        # The first two properties are the explicitly required ones.
        payload = {
            "assay_term_name": "single-cell ATAC-seq",
            "experiment_classification": ["functional genomics"],
            "biosample_ontology": sorting_biosample["biosample_term_name"]["name"],
        }
        description = sorting.description.strip()
        if description:
            # Only submit a description that has content.
            payload["description"] = description
        payload["documents"] = self.post_documents(sorting.document_ids)
        # The returned identifiers of the experiment and the replicate aren't used below.
        exp_upstream = self.post(payload=payload, dcc_profile="experiment", pulsar_model=models.SingleCellSorting, pulsar_rec_id=rec_id)

        # Submit biosample
        self.post_biosample(rec_id=sorting_biosample.id, patch=patch)
        # Submit library_prototype (which is linked to sorting_biosample when it is created)
        library_upstream = self.post_library(rec_id=sorting.library_prototype_id, patch=patch)
        # Submit replicate.
        # The experiment will be determined via inspection of associated biosample.
        replicate_upstream = self.post_replicate(library_upstream=library_upstream, patch=patch)

        # A SingleCellSorting has many SequencingRequests through the Plates association.
        sequencing_requests = [models.SequencingRequest(sreq_id) for sreq_id in sorting.sequencing_request_ids]
        for sequencing_request in sequencing_requests:
            sequencing_runs = [models.SequencingRun(srun_id) for srun_id in sequencing_request.sequencing_run_ids]
            for sequencing_run in sequencing_runs:
                storage_loc_id = sequencing_run.storage_location_id
                sres_ids = sequencing_run.sequencing_result_ids
                # Submit a file record (nothing is submitted here yet)

if __name__ == "__main__":
    # Manual run when the module is executed as a script: submits the ChIP-seq experiment
    # with Pulsar record ID 164 to the ENCODE Portal host named below.
    demo_host = "v79x0-test-master.demo.encodedcc.org"
    s = Submit(dcc_mode=demo_host)
    s.post_chipseq_exp(rec_id=164, patch=False)
    #s.post_chipseq_control_experiments(164)