Source code for bioservices.pride

#
#  This file is part of bioservices software
#
#  Copyright (c) 2013-2014 - EBI-EMBL
#
#  File author(s):
#      https://github.com/cokelaer/bioservices
#
#  Distributed under the GPLv3 License.
#  See accompanying file LICENSE.txt or copy at
#      http://www.gnu.org/licenses/gpl-3.0.html
#
#  source: http://github.com/cokelaer/bioservices
#  documentation: http://packages.python.org/bioservices
#
##############################################################################

"""Interface to PRIDE web service

.. topic:: What is PRIDE ?

    :URL: http://www.ebi.ac.uk/pride/ws/archive/v2

    .. highlights::

         The PRIDE PRoteomics IDEntifications database is a centralized,
         standards compliant, public data repository for proteomics data,
         including protein and peptide identifications, post-translational
         modifications and supporting spectral evidence.

        -- From PRIDE web site, Jan 2015


"""
import tqdm

from bioservices.services import REST
from bioservices import logger

# Rename the logger object imported from the bioservices package after this
# module, so that messages emitted below are attributed to it.
logger.name = __name__


# Only the PRIDE class is exported by ``from bioservices.pride import *``.
__all__ = ["PRIDE"]


class PRIDE:
    """Interface to the `PRIDE <https://www.ebi.ac.uk/pride/>`_ service

    All requests are sent to the PRIDE Archive REST API (v2) through a
    :class:`bioservices.services.REST` instance stored in :attr:`services`.

    ::

        from bioservices import PRIDE
        p = PRIDE()
        p.get_peptide_evidence(project_accession="PRD000001")

    .. versionchanged:: 1.10.1

        Due to new API:

        - the method project_count was dropped.
        - get_project_list was renamed in get_project_files
        - get_assays, get_assay_count, get_assay_count_project_accession, get_assay_list were dropped in v2
        - get_protein_list, get_protein_count, get_protein_count_assay, get_protein_list_assay
          replaced by get_protein_evidences method
        - get_peptide_list_assay, get_peptide_count, get_peptide_list, get_peptide_list_sequence,
          get_peptide_count_assay replaced by get_peptide_evidence.

    """

    # Base URL that every query path below is appended to.
    _url = "https://www.ebi.ac.uk/pride/ws/archive/v2"

    def __init__(self, verbose=False, cache=False):
        """**Constructor**

        :param verbose: set to False to prevent informative messages
        :param cache: set to True to use caching. Not recommended for
            this service that evolves a lot
        """
        self.services = REST(name="PRIDE", url=PRIDE._url, verbose=verbose, cache=cache)

    def get_project(self, identifier):
        """Retrieve project information by accession

        :param str identifier: a valid PRIDE identifier e.g., PRD000001

        :return: the answer of the ``projects/{identifier}`` query. If the
            service answers with the status code 400 (invalid identifier),
            a warning is logged and an empty dictionary {} is returned.

        .. doctest::

            >>> from bioservices import PRIDE
            >>> p = PRIDE()
            >>> res = p.get_project("PRD000001")
            >>> res['title']
            'COFRADIC proteome of unstimulated human blood platelets'

        """
        res = self.services.http_get(f"projects/{identifier}")
        # NOTE(review): only status 400 is mapped to {}; any other value
        # (including other status codes) is handed back to the caller as is.
        if res == 400:
            logger.warning(f"Nothing found for {identifier}. may be this is not a valid identifier. Use get_projects")
            return {}
        return res

    def get_projects(self, pageSize=100, max_pages=1e9):
        """Get list of all projects

        The number of projects is first obtained with
        :meth:`get_projects_count`; the pages are then downloaded one by one
        (with a progress bar) and their ``_embedded/projects`` entries are
        concatenated.

        :param int pageSize: how many projects to request per page
        :param max_pages: maximum number of pages to download. Integers and
            floats (e.g. the default 1e9) are both accepted.
        :return: list of projects as returned by the service

        """
        total = self.get_projects_count()

        # Ceiling division: a final, partially filled page must be fetched as
        # well (and a single page is needed when total < pageSize).
        n_pages = -(-total // pageSize)

        # range() only accepts integers whereas max_pages may be a float.
        n_pages = int(min(n_pages, max_pages))

        results = []
        for page in tqdm.tqdm(range(n_pages)):
            res = self.services.http_get("projects", params={"pageSize": pageSize, "page": page})
            results.extend(res["_embedded"]["projects"])

        return results

    def get_projects_count(self):
        """Return the total number of projects

        :return: the ``page/totalElements`` field found in the answer of the
            ``projects`` query.
        """
        res = self.services.http_get("projects")
        return res["page"]["totalElements"]

    def get_project_files(self, accession, pageSize=100, page=0, sortConditions=None, sortDirection="DESC", filters=""):
        """List the files of a given project

        :param str accession: the accession number to look for
        :param int pageSize: how many results to return per page
        :param int page: which page (starting from 0) of the result to return
        :param str sortConditions: field(s) used for sorting. Several fields
            can be separated by comma. Example: submission_date,project_title.
            Default is None.
        :param str sortDirection: the sorting order (ASC or DESC)
        :param str filters: Parameters to filter the search results. The structure of
            the filter is: field1==value1, field2==value2. Example accession==PRD000001
        :return: the ``list`` entry of the answer if there is one, otherwise
            the answer itself.

        ::

            >>> p = PRIDE()
            >>> results = p.get_project_files(accession="PRD000001", pageSize=10, page=1)


        In v1.10.1 due to new PRIDE API, the method **get_file_count** was dropped. You can use::

            len(results['_embedded']['files'])

        Similarly the **get_file_list** method was dropped since all results are
        stored in the output of this method


        """
        params = {
            "pageSize": pageSize,
            "page": page,
            "sortDirection": sortDirection,
            "sortConditions": sortConditions,
            "filter": filters,
        }

        res = self.services.http_get(f"projects/{accession}/files", params=params)
        try:
            return res["list"]
        except (KeyError, TypeError):
            # No "list" key, or the answer cannot be indexed by a string
            # (e.g. an integer status code): return the answer untouched.
            return res

    def get_protein_evidences(
        self,
        project_accession=None,
        assay_accession=None,
        reported_accession=None,
        pageSize=100,
        page=0,
        sortDirection="DESC",
        sortConditions="projectAccession",
    ):
        """Get all proteins evidence

        The three accession filters are optional; only those that are set are
        sent to the service.

        :param project_accession: sent as ``projectAccession``
        :param assay_accession: sent as ``assayAccession``
        :param reported_accession: sent as ``reportedAccession``
        :param int pageSize: how many results to return per page
        :param int page: which page (starting from 0) of the result to return
        :param str sortConditions: field(s) used for sorting (default is
            projectAccession). Several fields can be separated by comma.
        :param str sortDirection: the sorting order (ASC or DESC)
        :return: answer of the ``proteinevidences`` query

        ::

            p.get_protein_evidences()['_embedded']['proteinevidences']
        """
        params = {}
        if project_accession:
            params["projectAccession"] = project_accession
        if assay_accession:  # pragma: no cover
            params["assayAccession"] = assay_accession
        if reported_accession:  # pragma: no cover
            params["reportedAccession"] = reported_accession
        params["pageSize"] = pageSize
        params["page"] = page
        params["sortConditions"] = sortConditions
        params["sortDirection"] = sortDirection

        return self.services.http_get("proteinevidences", params=params)

    def get_peptide_evidence(
        self,
        project_accession=None,
        assay_accession=None,
        protein_accession=None,
        peptide_evidence_accession=None,
        peptide_sequence=None,
        pageSize=100,
        page=0,
        sortDirection="DESC",
        sortConditions="projectAccession",
    ):
        """Get all the peptide evidences for an specific protein evidence

        The accession/sequence filters are optional; only those that are set
        are sent to the service.

        :param project_accession: sent as ``projectAccession``
        :param assay_accession: sent as ``assayAccession``
        :param protein_accession: sent as ``proteinAccession``
        :param peptide_evidence_accession: sent as ``peptideEvidenceAccession``
        :param peptide_sequence: sent as ``peptideSequence``
        :param int pageSize: how many results to return per page
        :param int page: which page (starting from 0) of the result to return
        :param str sortConditions: field(s) used for sorting (default is
            projectAccession). Several fields can be separated by comma.
        :param str sortDirection: the sorting order (ASC or DESC)
        :return: answer of the ``peptideevidences`` query

        Retrieving data from project accession should be fast::

            p.get_peptide_evidence(protein_accession="Q8IX30")

        but other methods may be slow::

            p.get_peptide_evidence(peptide_sequence="CQGSPGASKAMLSCNR")
        """
        params = {}
        if project_accession:
            params["projectAccession"] = project_accession
        if assay_accession:  # pragma: no cover
            params["assayAccession"] = assay_accession
        if protein_accession:  # pragma: no cover
            params["proteinAccession"] = protein_accession
        if peptide_evidence_accession:  # pragma: no cover
            params["peptideEvidenceAccession"] = peptide_evidence_accession
        if peptide_sequence:  # pragma: no cover
            params["peptideSequence"] = peptide_sequence
        params["pageSize"] = pageSize
        params["page"] = page
        params["sortConditions"] = sortConditions
        # Forward the sorting order, as get_protein_evidences does.
        params["sortDirection"] = sortDirection

        return self.services.http_get("peptideevidences", params=params)

    def get_stats(self, name=None):
        """Retrieve statistics by Name

        :param str name: name of the statistics to retrieve. If None, the
            ``stats/`` query is sent instead of ``stats/{name}``.
        :return: answer of the query

        If you do not have the name, just type::

            p.get_stats()

        and then, e.g., ::

            p.get_stats("SUBMISSIONS_PER_YEAR")

        """
        query = "stats/" if name is None else f"stats/{name}"
        return self.services.http_get(query)