#
# This file is part of bioservices software
#
# Copyright (c) 2013-2016 - EBI-EMBL
# Copyright (c) 2016-2021 - Institut Pasteur, Paris
#
# Distributed under the GPLv3 License.
# See accompanying file LICENSE.txt or copy at
# http://www.gnu.org/licenses/gpl-3.0.html
#
# website: https://github.com/cokelaer/bioservices
# documentation: http://packages.python.org/bioservices
#
##############################################################################
"""Interface to some part of the UniProt web service
.. topic:: What is UniProt ?
:URL: http://www.uniprot.org
:Citation:
.. highlights::
"The Universal Protein Resource (UniProt) is a comprehensive resource for protein
sequence and annotation data. The UniProt databases are the UniProt
Knowledgebase (UniProtKB), the UniProt Reference Clusters (UniRef), and the
UniProt Archive (UniParc). The UniProt Metagenomic and Environmental Sequences
(UniMES) database is a repository specifically developed for metagenomic and
environmental data."
-- From Uniprot web site (help/about) , Dec 2012
.. mapping between uniprot and bunch of other DBs.
.. ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/
.. http://www.uniprot.org/docs/speclist
.. http://www.uniprot.org/docs/pkinfam
"""
import io
import time
import urllib
import json
import tqdm
import pandas as pd
from bioservices.services import REST
from bioservices import logger
logger.name = __name__
__all__ = ["UniProt"]
[docs]class UniProt:
"""Interface to the `UniProt <http://www.uniprot.org>`_ service
::
>>> from bioservices import UniProt
>>> u = UniProt(verbose=False)
>>> u.mapping("UniProtKB_AC-ID", "KEGG", query='P43403')
defaultdict(<type 'list'>, {'P43403': ['hsa:7535']})
>>> res = u.search("P43403")
# Returns sequence on the ZAP70_HUMAN accession Id
>>> sequence = u.search("ZAP70_HUMAN", columns="sequence")
.. versionchanged:: 1.10
Uniprot update its service in June 2022. Changes were made in the bioservices
API with small changes. User API is more or less the same. Main issues that may
be faced are related to change of output column names. Please see the
:attr:`_legacy_names` for corresponding changes.
Some notes about searches. The *and* and *or* are now upper cases.
The *organism* and *taxonomy* fields are now *organism_id* and *taxonomy_id*
"""
# June 2022, API changes and these labels changed:
_legacy_names = {
"id": "accession",
"entry name": "id",
"genes": "gene_names",
"genes(PREFERRED)": "gene_primary",
"genes(ALTERNATIVE)": "gene_synonym",
"genes(OLN)": "gene_oln",
"genes(ORF)": "gene_orf",
"organism": "organism_name",
"organism-id": "organism_id",
"protein names": "protein_name",
"proteome": "xref_proteomes",
"lineage(ALL)": "lineage",
"virus hosts": "virus_hosts",
"comment(ALTERNATIVE PRODUCTS)": "cc_alternative_products",
"feature(ALTERNATIVE SEQUENCE)": "ft_var_seq",
"comment(ERRONEOUS GENE MODEL PREDICTION)": "error_gmodel_pred",
"fragment": "fragment",
"encodedon": "organelle",
"length": "length",
"mass": "mass",
"comment(MASS SPECTROMETRY)": "cc_mass_spectrometry",
"feature(NATURAL VARIANT)": "ft_variant",
"feature(NON ADJACENT RESIDUES)": "ft_non_cons",
"feature(NON STANDARD RESIDUE)": "ft_non_std",
"feature(NON TERMINAL RESIDUE)": "ft_non_ter",
"comment(POLYMORPHISM)": "cc_polymorphism",
"comment(RNA EDITING)": "cc_rna_editing",
"sequence": "sequence",
"comment(SEQUENCE CAUTION)": "cc_sequence_caution",
"feature(SEQUENCE CONFLICT)": "ft_conflict",
"feature(SEQUENCE UNCERTAINTY)": "ft_unsure",
"version(sequence)": "sequence_version",
# function
"comment(ABSORPTION)": "absorption",
"feature(ACTIVE SITE)": "ft_act_site",
"comment(ACTIVITY REGULATION)": "cc_activity_regulation",
"feature(BINDING SITE)": "ft_binding",
"chebi": "ft_ca_bind",
"chebi(Catalytic activity)": "cc_catalytic_activity",
"chebi(Cofactor)": "cc_cofactor",
"feature(DNA BINDING)": "ft_dna_bind",
"ec": "ec",
"comment(FUNCTION)": "cc_function",
"comment(KINETICS)": "kinetics",
"feature(METAL BINDING)": "ft_metal",
"feature(NP BIND)": "ft_np_bind",
"comment(PATHWAY)": "cc_pathway",
"comment(PH DEPENDENCE)": "ph_dependence",
"comment(REDOX POTENTIAL)": "redox_potential",
"rhea-id": "rhea_id",
"feature(SITE)": "ft_site",
"comment(TEMPERATURE DEPENDENCE)": "temp_dependence",
# misc
"annotation score": "annotation_score",
"comment(CAUTION)": "cc_caution",
"features": "feature",
"keyword-id": "keywordid",
"keywords": "keyword",
"comment(MISCELLANEOUS)": "cc_miscellaneous",
"existence": "protein_existence",
"reviewed": "reviewed",
"tools": "tools",
"uniparcid": "uniparc_id",
# Interaction =============================
"interactor": "cc_interaction",
"comment(SUBUNIT)": "cc_subunit",
# GO
"go": "go",
"go(biological process)": "go_p",
"go(cellular component)": "go_c",
"go(molecular function)": "go_f",
"go-id": "go_id",
# Date of
"created": "date_created",
"last-modified": "date_modified",
"sequence-modified": "date_sequence_modified",
"version(entry)": "version",
# STRUCTURE
"3d": "structure_3d",
"feature(BETA STRAND)": "ft_strand",
"feature(HELIX)": "ft_helix",
"feature(TURN)": "ft_turn",
# subcellular function
"comment(SUBCELLULAR LOCATION)": "cc_subcellular_location",
"feature(INTRAMEMBRANE)": "ft_intramem",
"feature(TOPOLOGICAL DOMAIN)": "ft_topo_dom",
"feature(TRANSMEMBRANE)": "ft_transmem",
# Pathology
"comment(ALLERGEN)": "cc_allergen",
"comment(BIOTECHNOLOGY)": "cc_biotechnology",
"comment(DISRUPTION PHENOTYPE)": "cc_disruption_phenotype",
"comment(DISEASE)": "cc_disease",
"feature(MUTAGENESIS)": "ft_mutagen",
"comment(PHARMACEUTICAL)": "cc_pharmaceutical",
"comment(TOXIC DOSE)": "cc_toxic_dose",
# PTM
"feature(CHAIN)": "ft_chain",
"feature(CROSS LINK)": "ft_crosslnk",
"feature(DISULFIDE BOND)": "ft_disulfid",
"feature(GLYCOSYLATION)": "ft_carbohyd",
"feature(INITIATOR METHIONINE)": "ft_init_met",
"feature(LIPIDATION)": "ft_lipid",
"feature(MODIFIED RESIDUE)": "ft_mod_res",
"feature(PEPTIDE)": "ft_peptide",
"comment(PTM)": "cc_ptm",
"feature(PROPEPTIDE)": "ft_propep",
"feature(SIGNAL)": "ft_signal",
"feature(TRANSIT)": "ft_transit",
# Family domains
"feature(COILED COIL)": "ft_coiled",
"feature(COMPOSITIONAL BIAS)": "ft_compbias",
"comment(DOMAIN)": "cc_domain",
"feature(DOMAIN EXTENT)": "ft_domain",
"feature(MOTIF)": "ft_motif",
"families": "protein_families",
"feature(REGION)": "ft_region",
"feature(REPEAT)": "ft_repeat",
"comment(SIMILARITY)": "<does not exist>",
"feature(ZINC FINGER)": "ft_zn_fing",
}
_valid_columns = [
# Names & Taxonomy ================================================
"accession",
"id",
"gene_names",
"gene_primary",
"gene_synonym",
"gene_oln",
"gene_orf",
"organism_name",
"organism_id",
"protein_name",
"xref_proteomes",
"lineage",
"virus_hosts",
# Sequences ========================================================
"fragment",
"sequence",
"length",
"mass",
"organelle",
"cc_alternative_products",
"error_gmodel_pred",
"cc_mass_spectrometry",
"cc_polymorphism",
"cc_rna_editing",
"cc_sequence_caution",
"ft_var_seq",
"ft_variant",
"ft_non_cons",
"ft_non_std",
"ft_non_ter",
"ft_conflict",
"ft_unsure",
"sequence_version",
# Family and Domains ========================================
"ft_coiled",
"ft_compbias",
"cc_domain",
"ft_domain",
"ft_motif",
"protein_families",
"ft_region",
"ft_repeat",
"ft_zn_fing",
# Function ===================================================
"absorption",
"ft_act_site",
"cc_activity_regulation",
"ft_binding",
# 'ft_ca_bind',
"cc_catalytic_activity",
"cc_cofactor",
"ft_dna_bind",
"ec",
"cc_function",
"kinetics",
# 'ft_metal',
# 'ft_np_bind',
"cc_pathway",
"ph_dependence",
"redox_potential",
# 'rhea_id',
"ft_site",
"temp_dependence",
# Gene Ontology ==================================
"go",
"go_p",
"go_f",
"go_c",
"go_id",
# Interaction ======================================
"cc_interaction",
"cc_subunit",
# EXPRESSION =======================================
"cc_developmental_stage",
"cc_induction",
"cc_tissue_specificity",
# Publications
"lit_pubmed_id",
# Date of
"date_created",
"date_modified",
"date_sequence_modified",
"version",
# Structure
"structure_3d",
"ft_strand",
"ft_helix",
"ft_turn",
# Subcellular location
"cc_subcellular_location",
"ft_intramem",
"ft_topo_dom",
"ft_transmem",
# Miscellaneous ==========================
"annotation_score",
"cc_caution",
"comment_count",
# "feature",
"feature_count",
"keyword",
"keywordid",
"cc_miscellaneous",
"protein_existence",
"tools",
"reviewed",
"uniparc_id",
# Pathology
"cc_allergen",
"cc_biotechnology",
"cc_disruption_phenotype",
"cc_disease",
"ft_mutagen",
"cc_pharmaceutical",
"cc_toxic_dose",
# PTM / Processsing
"ft_chain",
"ft_crosslnk",
"ft_disulfid",
"ft_carbohyd",
"ft_init_met",
"ft_lipid",
"ft_mod_res",
"ft_peptide",
"cc_ptm",
"ft_propep",
"ft_signal",
"ft_transit",
# not documented
"xref_pdb",
]
_url = "https://rest.uniprot.org"
def __init__(self, verbose=False, cache=False):
"""**Constructor**
:param verbose: set to False to prevent informative messages
:param cache: set to True to cache request
"""
self.services = REST(name="UniProt", url=UniProt._url, verbose=verbose, cache=cache, url_defined_later=True)
self.TIMEOUT = 100
self._valid_mapping = None
self._database = "uniprot"
def _download_flat_files(self, output="uniprot_sprot.dat.gz"): # pragma: no cover
"""could be used to get all data in flat files (about compressed 500Mb )"""
# deprecated in v1.10 due to API change in uniprot
url = "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz"
self.services.logging.info("Downloading uniprot file from the web. May take some time.:")
urllib.request.urlretrieve(url, output)
def _get_valid_mapping(self):
if not self._valid_mapping:
self._set_valid_mapping()
return self._valid_mapping
def _set_valid_mapping(self):
fields = self.services.http_get("configure/idmapping/fields")
groups = fields["groups"]
rules = {}
for item in fields["rules"]:
ID = item["ruleId"]
rules[ID] = item
# This is suppose to be a set of database name available in Uniprot
from_to = {}
for item in [x for group in groups for x in group["items"]]:
# should be name, not DisplayName
name = item["name"]
if item["from"]:
tos = rules[item["ruleId"]]["tos"]
from_to[name] = tos
self._valid_mapping = from_to
valid_mapping = property(_get_valid_mapping, _set_valid_mapping)
def _get_next_link(self, headers):
import re
re_next_link = re.compile(r'<(.+)>; rel="next"')
if "Link" in headers:
match = re_next_link.match(headers["Link"])
if match:
return match.group(1)
[docs] def mapping(
self,
fr="UniProtKB_AC-ID",
to="KEGG",
query="P13368",
polling_interval_seconds=3,
max_waiting_time=100,
progress=True,
):
"""This is an interface to the UniProt mapping service
:param fr: the source database identifier. See :attr:`valid_mapping`.
:param to: the targetted database identifier. See :attr:`valid_mapping`.
:param query: a string containing one or more IDs separated by a comma
It can also be a list of strings.
:param polling_interval_seconds: the number of seconds between each status check of the current job
:param max_waiting_time: the maximum number of seconds to wait for the final answer.
:return: a dictionary with two possible keys. The first one is 'results'
with the from / to answers and the second one 'failedIds' with Ids that were not found
::
>>> u.mapping("UniProtKB_AC-ID", "KEGG", 'P43403')
{'results': [{'from': 'P43403', 'to': 'hsa:7535'}]}
The output is a dictionary. Identifiers that were not found are stored in the keys
'failedIds'. Succesful queries are stored in the 'results' key that is a list
of dictionaries with two keys set to 'from' and 'to'. The 'from' key should be in your input list.
The 'to' key is the result. Here we have the KEGG identifier recognised by its prefix 'hsa:', which is for human.
Sometimes the output ('to') it is more complicated. Consider the following example::
u.mapping("UniParc", "UniProtKB", 'UPI0000000001,UPI0000000002')
You will see that the UniParc results is more complex than just an identifier.
See :attr:`valid_mapping` attribute for list of valid mapping identifiers.
Note that according to Uniprot (June 2022), there are various limits on ID Mapping Job Submission:
========= =====================================================================================
Limit Details
========= =====================================================================================
100,000 Total number of ids allowed in comma separated param ids in /idmapping/run api
500,000 Total number of "mapped to" ids allowed
100,000 Total number of "mapped to" ids allowed to be enriched by UniProt data
10,000 Total number of "mapped to" ids allowed with filtering
========= =====================================================================================
.. versionchanged:: 1.1.1 to return a dictionary instaed of a list
.. versionchanged:: 1.1.2 the values for each key is now made of a list
instead of strings so as to store more than one values.
.. versionchanged:: 1.2.0 input query can also be a list of strings
instead of just a string
.. versionchanged:: 1.3.1 use http_post instead of http_get. This is 3 times
faster and allows queries with more than 600 entries in one go.
.. versionchanged:: 1.10.0 new API due to uniprot website update
.. versionchanged:: 1.11.0 implement batch to prevent limit of 25 results.
"""
if isinstance(query, (list, tuple)):
query = ",".join(query)
elif isinstance(query, str):
pass
# First, we call the real mapping request
params = {"from": fr, "to": to, "ids": query}
job = self.services.http_post("idmapping/run", frmt="json", data=params)
try:
job_id = job["jobId"]
except TypeError:
logger.error(self.services.last_response.content.decode())
return
# the job id will tell us about the job status
results = None
waiting_time = 0
while not results and waiting_time < max_waiting_time:
logger.info("Waiting for {job_id} to complete")
results = self.services.http_get(f"idmapping/status/{job_id}", frmt="json")
# once ready, we can process the data
if results != 500 and "results" in results:
total = int(self.services.last_response.headers["X-Total-Results"])
batches = results["results"]
fails = results.get("failedIds", [])
size = 25
for x in tqdm.tqdm(range(size, total, size), disable=not progress):
link = self._get_next_link(self.services.last_response.headers)
batch = self.services.http_get(link, frmt="json")
batches += batch["results"]
fails += results.get("failedIds", [])
return {"results": batches, "failedIds": fails}
else: # pragma: no cover
time.sleep(polling_interval_seconds)
results = None
waiting_time += polling_interval_seconds
[docs] def retrieve(self, uniprot_id, frmt="json", database="uniprot", include=False):
"""Search for a uniprot ID in UniProtKB database
:param str uniprot: a valid UniProtKB ID, or uniref, uniparc or taxonomy.
:param str frmt: expected output format amongst xml, txt, fasta, gff, rdf
:param str database: database name in (uniprot, uniparc, uniref, taxonomy)
:param bool include: include data with RDF format.
:return: if the parameter uniprot_id is string, the output will be a a list of identifiers is provided, the output is also a list
otherwise, a string. The content of the string of items in the list
depends on the value of **frmt**.
::
>>> u = UniProt()
>>> res = u.retrieve("P09958", frmt="txt")
>>> fasta = u.retrieve(['P29317', 'Q5BKX8', 'Q8TCD6'], frmt='fasta')
>>> print(fasta[0])
.. versionchanged:: 1.10 the xml format is now returned as raw XML. It is not
interpreted anymore. The RDF has now an additional option to include data
from referenced data sets directly in the returned data (set include=True parameter).
Default output format is now set to json.
"""
if database == "uniprot":
if frmt not in ("txt", "xml", "rdf", "gff", "fasta", "json"): # pragma: no cover
self.services.logging.warning("frmt must be set to one of: txt, xml, rdf, gff, fasta, json.")
elif database == "uniparc":
if frmt not in ("xml", "rdf", "fasta", "tsv", "json"): # pragma: no cover
raise ValueError("frmt must be set to one of: tsv, xml, rdf, gff, fasta, json")
self.services.logging.warning("frmt must be set to one of: txt, xml, rdf, gff, fasta.")
elif database == "uniref":
if frmt not in ("xml", "rdf", "fasta", "tsv", "json"): # pragma: no cover
self.services.logging.warning("frmt must be set to one of: xml, rdf, gff, fasta, json.")
elif database == "taxonomy":
pass
else: # pragma: no cover
self.services.logging.warning("database must be set to uniref, uniparc, uniprot or taxonomy")
if isinstance(uniprot_id, str):
queries = uniprot_id.split(",")
else:
queries = uniprot_id
# some magic here not documented on uniprot website...but multiple queries are possible
url = [database + "/" + query + "." + frmt for query in queries]
# the frmt=txt here is for the requests, nothing related to the uniprot format
res = self.services.http_get(url, frmt="txt", params={"include": include})
if frmt == "json":
for i, x in enumerate(res):
try:
res[i] = json.loads(x)
except:
pass
if isinstance(res, list) and len(res) == 1:
res = res[0]
return res
[docs] def get_fasta(self, uniprot_id):
"""Returns FASTA string given a valid identifier
:param str uniprot_id: a valid identifier (e.g. P12345)
This is just an alias to :meth:`retrieve` when setting the format to 'fasta'.
Method kept for legacy.
"""
res = self.retrieve(uniprot_id, frmt="fasta")
return res
[docs] def search(
self,
query,
frmt="tsv",
columns=None,
include_isoforms=False,
sort="score",
compress=False,
limit=None,
size=25,
database="uniprotkb",
progress=True,
):
"""Provide some interface to the uniprot search interface.
:param str query: query must be a valid uniprot query.
See https://www.uniprot.org/help/query-fields and examples below
:param str frmt: a valid format amongst xlsx, fasta, gff,
tsv and json. OTher format are not available within bioservices (rss, obo, rdf, xml)
(default is tsv)
:param str columns: comma-separated list of values. Works only if fomat
is tsv or xlsx. For UnitProtKB, some possible columns are:
id, entry name, length, organism.
See also :attr:`~bioservices.uniprot.UniProt.valid_mapping`
for the full list of column keywords.
:param bool include_isoform: include isoform sequences when the frmt
parameter is fasta. Include description when frmt is rdf.
:param str sort: by score by default. Set to None to bypass this behaviour
:param bool compress: gzip the results
:param int limit: Stops the download of results once this limit is crossed.
if size is 25 and limit is set to 30, 25+25 results will be returned though.
users need to do a post filtering.
:param int size: chunk of results (25 by default on uniprot website).
:return: depends on the value of **frmt**. Uniprot API returns all results in several
pages with **size** elements per page. If frmt is set to xlsx, output is a list of
excel-like page with **size** per item. If frmt is set to tsv, bioservices concatenate
all pages in a single string. Similarly for **gff**, **fasta** or **json**, bioservices concatenates
all pages in a single variable (txt or dictionary depending on the requested format).
To obtain the list of uniprot ID returned by the search of zap70 can be
retrieved as follows::
>>> u.search('zap70+AND+organism_id:9606')
>>> u.search("zap70+AND+taxonomy_id:9606", frmt="tsv", limit=3,
... columns="entry_name,length,id, gene_names")
Entry name Length Entry Gene names
CBLB_HUMAN 982 Q13191 CBLB RNF56 Nbla00127
CBL_HUMAN 906 P22681 CBL CBL2 RNF55
CD3Z_HUMAN 164 P20963 CD247 CD3Z T3Z TCRZ
other examples::
>>> u.search("ZAP70+AND+organism_id:9606", limit=3, columns="id,xref_pdb")
You can also do a search on several keywords. This is especially useful
if you have a list of known entry names.::
>>> u.search("ZAP70_HUMAN+OR+CBL_HUMAN", frmt="tsv", limit=3,
... columns="entry name,length,id, genes")
Entry name Length Entry Gene names
Finally, note that when you search for a query, you may have several hits::
>>> u.search("P12345)
including the ID P12345 but also related entries. If you
need only the entry that perfectly match the query, use::
>>> u.search("accession:P12345")
This was provided from a user issue that was solved here:
https://github.com/cokelaer/bioservices/issues/122
.. warning:: some columns although valid may not return anything, not even in
the header: 'score', 'taxonomy', 'tools'. this is a uniprot feature,
not bioservices.
.. versionchanged:: 1.10
Due to uniprot API changes in June 2022:
* parameter 'include' is now named 'include_isoform
* default parameter 'tab' is now 'tsv' but does not change the results
.. versionchanged:: 1.11
* removed the offset argument
* add size parameter and keep limit parameter
* add progress bar option (True by default)
* drop frmt in : rdf, obo, xml, html
"""
params = {}
if frmt is not None:
_valid_formats = [
"xlsx",
"fasta",
"json",
"gff",
"tsv",
]
self.services.devtools.check_param_in_list(frmt, _valid_formats)
params["format"] = frmt
if columns is not None:
self.services.devtools.check_param_in_list(frmt, ["tsv", "xls"])
# remove unneeded spaces before/after commas if any
if "," in columns:
columns = [x.strip() for x in columns.split(",")]
else:
columns = [columns]
# convert back to a string as expected by uniprot
params["fields"] = ",".join([x.strip() for x in columns])
if include_isoforms is True and frmt in ["fasta", "rdf"]:
params["includeIsoform"] = "yes"
if compress is True:
params["compress"] = "yes"
if sort:
self.services.devtools.check_param_in_list(sort, ["score"])
params["sort"] = sort
if size is not None:
if isinstance(size, int):
params["size"] = size
# if a limit is provided, we will stop before getting all pages
# consequently, we want the size to be a multiple of limit.
if limit is not None:
if isinstance(size, int):
params["size"] = limit
# + are interpreted and have a meaning. See arrayexpress module for details
query = query.replace("+", " ")
params["query"] = query
del params["sort"]
res = self.services.http_get(f"{database}/search", frmt="txt", params=params)
# concatenate by removing intermediate header
if frmt in ["tsv"]:
batches = res
# concatenate
elif frmt in ["fasta", "gff"]:
batches = res
elif frmt in ["json"]:
batches = json.loads(res)
else:
batches = [res]
try:
total = int(self.services.last_response.headers["X-Total-Results"])
except KeyError:
self.services.logging.error("No results found")
return
if limit is not None:
total = limit
# start after first chunk since we already called the general search once
for x in tqdm.tqdm(range(size, total, size), disable=not progress):
link = self._get_next_link(self.services.last_response.headers)
batch = self.services.http_get(link, frmt="txt")
# drop the header for tsv
if frmt in ["tsv"]:
batch = batch.split("\n")[1:]
batches += "\n".join(batch)
elif frmt in ["fasta", "gff"]:
batches += batch
elif frmt in ["json"]:
batches["results"].extend(json.loads(batch)["results"])
else:
batches += [batch]
if frmt in ["tsv"]:
# remove last empty line
batches = batches.strip()
return batches
[docs] def quick_search(self, query, limit=1):
"""a specialised version of :meth:`search`
This is equivalent to::
u = uniprot.UniProt()
u.search(query, frmt='tsv', sort="score", limit=1)
:returns: a dictionary.
"""
res = self.search(query, "tsv", include_isoforms=False, sort="score", limit=limit, progress=False)
# if empty result, nothing to do
if res and len(res) == 0:
return res
# else populate a dictionary
newres = {}
for line in res.split("\n")[1:]:
# print line
Entry, a, b, c, d, e, f = line.split("\t")
# print Entry, a, b, c, d, e, f
newres[Entry] = {
"Entry name": a,
"Status": b,
"Protein names": c,
"Gene names": d,
"Organism": e,
"Length": f,
}
return newres
[docs] def uniref(self, query):
"""Calls UniRef service
This is an alias to :meth:`retrieve`
::
>>> u = UniProt()
>>> u.uniref("Q03063")
Another example from https://github.com/cokelaer/bioservices/issues/121
is the combination of uniprot and uniref filters::
u.uniref("uniprot:(ec:1.1.1.282 taxonomy_name:bacteria reviewed:true)")
.. versionchanged:: 1.10 due to uniprot API changes in June 2022,
we now return a json instead of a pandas dataframe.
"""
res = self.services.http_get(f"uniref/UniRef90_{query}.json", frmt="json")
return res
[docs] def get_df(self, entries, nChunk=100, organism=None, limit=10, columns=None, progress=False):
"""Given a list of uniprot entries, this method returns a dataframe with all possible columns
:param entries: list of valid entry name. if list is too large (about
>200), you need to split the list
:param chunk:
:param limit: limit number of entries per identifier to 10. You can
set it to None to keep all entries but this will be very slow
:return: dataframe with indices being the uniprot id (e.g. DIG1_YEAST)
To get about 100 columns related to the accession P62988, type:
df = u.get_df('P62988')
Note that you may preceed the accesion by the keyword **sec_acc) to access secondary
accessions numbers::
df = u.get_df('sec_acc:P62988')
"""
if isinstance(entries, str):
entries = [entries]
else:
entries = list(set(entries))
output = pd.DataFrame()
self.services.logging.info("fetching information from uniprot for {} entries".format(len(entries)))
nChunk = min(nChunk, len(entries))
N, rest = divmod(len(entries), nChunk)
for i in range(0, N + 1):
this_entries = entries[i * nChunk : (i + 1) * nChunk]
if len(this_entries):
self.services.logging.info("uniprot.get_df {}/{}".format(i + 1, N))
query = "+OR+".join(this_entries)
if organism:
query += f"+AND+{organism}"
res = self.search(
query, frmt="tsv", columns=",".join(self._valid_columns), limit=limit, progress=progress
)
else:
break
if len(res) == 0:
self.services.logging.warning("some entries %s not found" % entries)
else:
df = pd.read_csv(io.StringIO(str(res)), sep="\t")
if isinstance(output, type(None)):
output = df.copy()
else:
# output = output.append(df, ignore_index=True)
output = pd.concat([output, df], ignore_index=True)
# you may end up with duplicated...
output.drop_duplicates(inplace=True)
# Sequences are splitted into chunks of 10 characters. let us remove
# the spaces:
if "sequence" in output.columns:
output["sequence"].fillna("", inplace=True)
output.Sequence = output["sequence"].apply(lambda x: x.replace(" ", ""))
if columns:
output = output[columns.split(",")]
return output