Source code for app.components.api_tools.uniprot

"""
UniProt data access and utilities.

This module provides helpers to download and process UniProtKB data via
the public REST API, including pagination handling, column mapping,
organism filtering, and convenience wrappers for common datasets used in
ProteoGyver.

Main entry points
-----------------
- ``download_uniprot_chunks``: batched download for an organism/field set
- ``download_full_uniprot_for_organism``: comprehensive download builder
- ``retrieve_uniprot``: cache-aware retrieval to a local TSV
- ``update``: check and refresh local UniProt dataset for an organism

Notes
-----
Large result sets are fetched using server-side pagination with retry
policies and optional progress reporting.
"""
from datetime import datetime
import sys
import os
import re
import warnings
import time
import requests
import pandas as pd
from requests.adapters import HTTPAdapter, Retry
from requests.exceptions import ChunkedEncodingError, ConnectionError, RequestException
import json
from . import apitools


# TODO: move to a separate file

[docs]
def get_uniprot_column_map() -> dict:
    """
    Returns currently understood map of uniprot columns in a dict.
    """
    return {'Entry': 'accession',
            'Absorption': 'absorption',
            'Annotation': 'annotation_score',
            'Activity regulation': 'cc_activity_regulation',
            'Allergenic properties': 'cc_allergen',
            'Alternative products': 'cc_alternative_products',
            'Biotechnological use': 'cc_biotechnology',
            'Catalytic activity': 'cc_catalytic_activity',
            'Caution': 'cc_caution',
            'Cofactor': 'cc_cofactor',
            'Developmental stage': 'cc_developmental_stage',
            'Involvement in disease': 'cc_disease',
            'Disruption phenotype': 'cc_disruption_phenotype',
            'Domain[CC]': 'cc_domain',
            'Function [CC]': 'cc_function',
            'Induction': 'cc_induction',
            'Interacts with': 'cc_interaction',
            'Mass spectrometry': 'cc_mass_spectrometry',
            'Miscellaneous [CC]': 'cc_miscellaneous',
            'Pathway': 'cc_pathway',
            'Pharmaceutical use': 'cc_pharmaceutical',
            'Polymorphism': 'cc_polymorphism',
            'Post-translational modification': 'cc_ptm',
            'RNA editing': 'cc_rna_editing',
            'Sequence caution': 'cc_sequence_caution',
            'Subcellular location[CC]': 'cc_subcellular_location',
            'Subunit structure[CC]': 'cc_subunit',
            'Tissue specificity': 'cc_tissue_specificity',
            'Toxic dose': 'cc_toxic_dose',
            'Comment Count': 'comment_count',
            'Date of creation': 'date_created',
            'Date of last modification': 'date_modified',
            'Date of last sequence modification': 'date_sequence_modified',
            'EC number': 'ec',
            'Erroneous gene model prediction': 'error_gmodel_pred',
            'Features': 'feature_count',
            'Fragment': 'fragment',
            'Active site': 'ft_act_site',
            'Binding site': 'ft_binding',
            # this field name causes problems 2022-08-22
            # 'Calcium binding': 'ft_ca_bind',
            'Glycosylation': 'ft_carbohyd',
            'Chain': 'ft_chain',
            'Coiled coil': 'ft_coiled',
            'Compositional bias': 'ft_compbias',
            'Sequence conflict': 'ft_conflict',
            'Cross-link': 'ft_crosslnk',
            'Disulfide bond': 'ft_disulfid',
            'DNA binding': 'ft_dna_bind',
            'Domain[FT]': 'ft_domain',
            'Helix': 'ft_helix',
            'Initiator methionine': 'ft_init_met',
            'Intramembrane': 'ft_intramem',
            'Lipidation': 'ft_lipid',
            # this field name causes problems 2022-08-22
            # 'Metal binding': 'ft_metal',
            'Modified residue': 'ft_mod_res',
            'Motif': 'ft_motif',
            'Mutagenesis': 'ft_mutagen',
            'Non-adjacent residues': 'ft_non_cons',
            'Non-standard residue': 'ft_non_std',
            'Non-terminal residue': 'ft_non_ter',
            # this field name causes problems 2022-08-22
            # 'Nucleotide binding': 'ft_np_bind',
            'Peptide': 'ft_peptide',
            'Propeptide': 'ft_propep',
            'Region': 'ft_region',
            'Repeat': 'ft_repeat',
            'Signal peptide': 'ft_signal',
            'Site': 'ft_site',
            'Beta strand': 'ft_strand',
            'Topological domain': 'ft_topo_dom',
            'Transit peptide': 'ft_transit',
            'Transmembrane': 'ft_transmem',
            'Turn': 'ft_turn',
            'Sequence uncertainty': 'ft_unsure',
            'Alternative sequence': 'ft_var_seq',
            'Natural variant': 'ft_variant',
            'Zinc finger': 'ft_zn_fing',
            'Gene names': 'gene_names',
            'Gene names (ordered locus)': 'gene_oln',
            'Gene names (ORF)': 'gene_orf',
            'Gene names (primary)': 'gene_primary',
            'Gene names (synonym)': 'gene_synonym',
            'Gene ontology (GO)': 'go',
            'Gene ontology (cellular component)': 'go_c',
            'Gene ontology (molecular function)': 'go_f',
            'Gene ontology IDs': 'go_id',
            'Gene ontology (biological process)': 'go_p',
            'Entry name': 'id',
            'Keywords': 'keyword',
            'Keyword ID': 'keywordid',
            'Kinetics': 'kinetics',
            'Length': 'length',
            'Taxonomic lineage': 'lineage',
            'Taxonomic lineage (IDs)': 'lineage_ids',
            'PubMed ID': 'lit_pubmed_id',
            'Mass': 'mass',
            'Gene encoded by': 'organelle',
            'Organism ID': 'organism_id',
            'Organism': 'organism_name',
            'pH dependence': 'ph_dependence',
            'Protein existence': 'protein_existence',
            'Protein families': 'protein_families',
            'Protein names': 'protein_name',
            'Redox potential': 'redox_potential',
            'Reviewed': 'reviewed',
            'Rhea ID': 'rhea',
            'Sequence': 'sequence',
            'Sequence version': 'sequence_version',
            '3D': 'structure_3d',
            'Temperature dependence': 'temp_dependence',
            'Tools': 'tools',
            'UniParc': 'uniparc_id',
            'Entry version': 'version',
            'Virus hosts': 'virus_hosts',
            'ABCD': 'xref_abcd',
            'Allergome': 'xref_allergome',
            'AlphaFoldDB': 'xref_alphafolddb',
            'Antibodypedia': 'xref_antibodypedia',
            'ArachnoServer': 'xref_arachnoserver',
            'Araport': 'xref_araport',
            'Bgee': 'xref_bgee',
            'BindingDB': 'xref_bindingdb',
            'BioCyc': 'xref_biocyc',
            'BioGRID': 'xref_biogrid',
            'BioGRID-ORCS': 'xref_biogrid-orcs',
            'BioMuta': 'xref_biomuta',
            'BMRB': 'xref_bmrb',
            'BRENDA': 'xref_brenda',
            'CarbonylDB': 'xref_carbonyldb',
            'CAZy': 'xref_cazy',
            'CCDS': 'xref_ccds',
            'CDD': 'xref_cdd',
            'CGD': 'xref_cgd',
            'ChEMBL': 'xref_chembl',
            'ChiTaRS': 'xref_chitars',
            'CLAE': 'xref_clae',
            'CleanEx': 'xref_cleanex',
            'CollecTF': 'xref_collectf',
            'ComplexPortal': 'xref_complexportal',
            'COMPLUYEAST-2DPAGE': 'xref_compluyeast-2dpage',
            'ConoServer': 'xref_conoserver',
            'CORUM': 'xref_corum',
            'CPTAC': 'xref_cptac',
            'CPTC': 'xref_cptc',
            'CTD': 'xref_ctd',
            'dbSNP': 'xref_dbsnp',
            'DEPOD': 'xref_depod',
            'dictyBase': 'xref_dictybase',
            'DIP': 'xref_dip',
            'DisGeNET': 'xref_disgenet',
            'DisProt': 'xref_disprot',
            'DMDM': 'xref_dmdm',
            'DNASU': 'xref_dnasu',
            'DOSAC-COBS-2DPAGE': 'xref_dosac-cobs-2dpage',
            'DrugBank': 'xref_drugbank',
            'DrugCentral': 'xref_drugcentral',
            'EchoBASE': 'xref_echobase',
            'eggNOG': 'xref_eggnog',
            'ELM': 'xref_elm',
            'EMBL': 'xref_embl',
            'Ensembl': 'xref_ensembl',
            'EnsemblBacteria': 'xref_ensemblbacteria',
            'EnsemblFungi': 'xref_ensemblfungi',
            'EnsemblMetazoa': 'xref_ensemblmetazoa',
            'EnsemblPlants': 'xref_ensemblplants',
            'EnsemblProtists': 'xref_ensemblprotists',
            'EPD': 'xref_epd',
            'ESTHER': 'xref_esther',
            'euHCVdb': 'xref_euhcvdb',
            'EvolutionaryTrace': 'xref_evolutionarytrace',
            'ExpressionAtlas': 'xref_expressionatlas',
            'FlyBase': 'xref_flybase',
            'Gene3D': 'xref_gene3d',
            'GeneCards': 'xref_genecards',
            'GeneID': 'xref_geneid',
            'GeneReviews': 'xref_genereviews',
            'GeneTree': 'xref_genetree',
            'Genevisible': 'xref_genevisible',
            'GeneWiki': 'xref_genewiki',
            'GenomeRNAi': 'xref_genomernai',
            'GlyConnect': 'xref_glyconnect',
            'GlyGen': 'xref_glygen',
            'Gramene': 'xref_gramene',
            'GuidetoPHARMACOLOGY': 'xref_guidetopharmacology',
            'HAMAP': 'xref_hamap',
            'HGNC': 'xref_hgnc',
            'HOGENOM': 'xref_hogenom',
            'HPA': 'xref_hpa',
            'IDEAL': 'xref_ideal',
            'IMGT_GENE-DB': 'xref_imgt_gene-db',
            'InParanoid': 'xref_inparanoid',
            'IntAct': 'xref_intact',
            'InterPro': 'xref_interpro',
            'iPTMnet': 'xref_iptmnet',
            'jPOST': 'xref_jpost',
            'KEGG': 'xref_kegg',
            'KO': 'xref_ko',
            'LegioList': 'xref_legiolist',
            'Leproma': 'xref_leproma',
            'MaizeGDB': 'xref_maizegdb',
            'MalaCards': 'xref_malacards',
            'MANE-Select': 'xref_mane-select',
            'MassIVE': 'xref_massive',
            'MaxQB': 'xref_maxqb',
            'MEROPS': 'xref_merops',
            'MetOSite': 'xref_metosite',
            'MGI': 'xref_mgi',
            'MIM': 'xref_mim',
            'MINT': 'xref_mint',
            'MoonDB': 'xref_moondb',
            'MoonProt': 'xref_moonprot',
            'neXtProt': 'xref_nextprot',
            'NIAGADS': 'xref_niagads',
            'OGP': 'xref_ogp',
            'OMA': 'xref_oma',
            'OpenTargets': 'xref_opentargets',
            'Orphanet': 'xref_orphanet',
            'OrthoDB': 'xref_orthodb',
            'PANTHER': 'xref_panther',
            'PathwayCommons': 'xref_pathwaycommons',
            'PATRIC': 'xref_patric',
            'PaxDb': 'xref_paxdb',
            'PCDDB': 'xref_pcddb',
            'PDB': 'xref_pdb',
            'PDBsum': 'xref_pdbsum',
            'PeptideAtlas': 'xref_peptideatlas',
            'PeroxiBase': 'xref_peroxibase',
            'Pfam': 'xref_pfam',
            'PharmGKB': 'xref_pharmgkb',
            'Pharos': 'xref_pharos',
            'PHI-base': 'xref_phi-base',
            'PhosphoSitePlus': 'xref_phosphositeplus',
            'PhylomeDB': 'xref_phylomedb',
            'PIR': 'xref_pir',
            'PIRSF': 'xref_pirsf',
            'PlantReactome': 'xref_plantreactome',
            'PomBase': 'xref_pombase',
            'PRIDE': 'xref_pride',
            'PRINTS': 'xref_prints',
            'PRO': 'xref_pro',
            'ProDom': 'xref_prodom',
            'ProMEX': 'xref_promex',
            'PROSITE': 'xref_prosite',
            'Proteomes': 'xref_proteomes',
            'ProteomicsDB': 'xref_proteomicsdb',
            'PseudoCAP': 'xref_pseudocap',
            'Reactome': 'xref_reactome',
            'REBASE': 'xref_rebase',
            'RefSeq': 'xref_refseq',
            'REPRODUCTION-2DPAGE': 'xref_reproduction-2dpage',
            'RGD': 'xref_rgd',
            'RNAct': 'xref_rnact',
            'SABIO-RK': 'xref_sabio-rk',
            'SASBDB': 'xref_sasbdb',
            'SFLD': 'xref_sfld',
            'SGD': 'xref_sgd',
            'SignaLink': 'xref_signalink',
            'SIGNOR': 'xref_signor',
            'SMART': 'xref_smart',
            'SMR': 'xref_smr',
            'STRING': 'xref_string',
            'SUPFAM': 'xref_supfam',
            'SWISS-2DPAGE': 'xref_swiss-2dpage',
            'SwissLipids': 'xref_swisslipids',
            'SwissPalm': 'xref_swisspalm',
            'TAIR': 'xref_tair',
            'TCDB': 'xref_tcdb',
            'TIGRFAMs': 'xref_tigrfams',
            'TopDownProteomics': 'xref_topdownproteomics',
            'TreeFam': 'xref_treefam',
            'TubercuList': 'xref_tuberculist',
            'UCD-2DPAGE': 'xref_ucd-2dpage',
            'UCSC': 'xref_ucsc',
            'UniCarbKB': 'xref_unicarbkb',
            'UniLectin': 'xref_unilectin',
            'UniPathway': 'xref_unipathway',
            'VectorBase': 'xref_vectorbase',
            'VEuPathDB': 'xref_veupathdb',
            'VGNC': 'xref_vgnc',
            'WBParaSite': 'xref_wbparasite',
            'WBParaSiteTranscriptProtein': 'xref_wbparasitetranscriptprotein',
            'World-2DPAGE': 'xref_world-2dpage',
            'WormBase': 'xref_wormbase',
            'Xenbase': 'xref_xenbase',
            'ZFIN': 'xref_zfin'
            }



[docs]
def get_default_uniprot_column_map() -> dict:
    """
    Returns a sensible, yet extensive default map of uniprot columns in a dict.
    """
    return {'Entry': 'accession',
            'Post-translational modification': 'cc_ptm',
            'Comment Count': 'comment_count',
            'Organism': 'organism_name',
            'Modified residue': 'ft_mod_res',
            'Gene names': 'gene_names',
            'Gene names (primary)': 'gene_primary',
            'Gene ontology (cellular component)': 'go_c',
            'Gene ontology (molecular function)': 'go_f',
            'Gene ontology (biological process)': 'go_p',
            'Entry name': 'id',
            'Keywords': 'keyword',
            'Length': 'length',
            'PubMed ID': 'lit_pubmed_id',
            'Mass': 'mass',
            'Protein names': 'protein_name',
            'Reviewed': 'reviewed',
            'Sequence': 'sequence',
            'Entry version': 'version',
            'Reactome': 'xref_reactome',
            'Cofactor': 'cc_cofactor',
            'Developmental stage': 'cc_developmental_stage',
            'Involvement in disease': 'cc_disease',
            'Disruption phenotype': 'cc_disruption_phenotype',
            'Domain[CC]': 'cc_domain',
            'Function [CC]': 'cc_function',
            'Induction': 'cc_induction',
            'Interacts with': 'cc_interaction',
            'Miscellaneous [CC]': 'cc_miscellaneous',
            'Sequence caution': 'cc_sequence_caution',
            'Subcellular location[CC]': 'cc_subcellular_location',
            'Date of last sequence modification': 'date_sequence_modified',
            'Organism ID': 'organism_id',
            'PhosphoSitePlus': 'xref_phosphositeplus',
            'Sequence version': 'sequence_version',
            'RefSeq': 'xref_refseq',
            'STRING': 'xref_string',
            'Ensembl': 'xref_ensembl',
            'IntAct': 'xref_intact',
            'InterPro': 'xref_interpro',
            'PANTHER': 'xref_panther',
            'SUPFAM': 'xref_supfam',
            'Pfam': 'xref_pfam',
            'PRO': 'xref_pro',
            'PROSITE': 'xref_prosite',
            }



def __get_uniprot_batch(batch_url: str | None, session: requests.Session, max_retries: int = 5, initial_backoff: float = 1.0) -> tuple: #TODO: mark all tuple returns with Generator class once available
    """
    Retrieves batches from UniProt, results response, and total number of results.
    Implements retry logic for connection errors with exponential backoff.

    :param batch_url: URL for the batch request.
    :param session: Requests session object.
    :param max_retries: Maximum number of retry attempts for connection errors.
    :param initial_backoff: Initial backoff time in seconds (doubles on each retry).
    :yields: a tuple of (requests.models.Response, total:str)
    """
    while batch_url:
        retry_count = 0
        backoff = initial_backoff
        
        while retry_count <= max_retries:
            try:
                response = session.get(batch_url, timeout=300)  # 5 minute timeout
                response.raise_for_status()
                # Force reading the content to catch ChunkedEncodingError early
                # This ensures the full response is downloaded before we yield it
                _ = response.content  # This will raise ChunkedEncodingError if connection fails
                try:
                    total = response.headers["x-total-results"]
                except KeyError:
                    total = 1
                yield (response, total)
                next_url = __get_uniprot_next_link(response.headers)
                batch_url = next_url if next_url else None
                break  # Success, exit retry loop
            except (ChunkedEncodingError, ConnectionError, RequestException) as e:
                retry_count += 1
                if retry_count > max_retries:
                    raise RuntimeError(
                        f"Failed to download UniProt batch after {max_retries} retries. "
                        f"Last error: {type(e).__name__}: {str(e)}"
                    ) from e
                warnings.warn(
                    f"Connection error downloading UniProt batch (attempt {retry_count}/{max_retries}): {str(e)}. "
                    f"Retrying in {backoff:.1f} seconds...",
                    UserWarning
                )
                time.sleep(backoff)
                backoff *= 2  # Exponential backoff


def __get_uniprot_next_link(headers) -> str|None:
    """
    Parses link to the next page of results in a UniProt query

    :param headers: dictionary of html headers.

    :returns: The link, or None is no link is found.
    """
    re_next_link = re.compile(r'<(.+)>; rel="next"')
    if "Link" in headers:
        match = re_next_link.match(headers["Link"])
        if match:
            return match.group(1)
    return None


[docs]
def download_uniprot_for_database(versions: list[str], organisms: set|None) -> tuple[pd.DataFrame, list[str]]:
    """
    Checks, whether an update for uniprot is available and downloads it if necessary for database use.

    :param versions: List of current versions of the UniProt database.
    :param organisms: Optional set of organism IDs to include.
    :param progress: True, if progress should be printed.

    :returns: Tuple of (DataFrame of UniProt data, List of new versions).
    """
    dfs = []
    db_fields = [
        "Entry",
        "Reviewed",
        "Gene names (primary)",
        "Entry name",
        "Gene names",
        "Organism",
        "Length",
        "Sequence"
    ]

    if organisms is None:
        organisms = {-1}
    else:
        organisms = {int(i) for i in organisms}
    org_versions = {}
    for v in versions:
        org, ver = v.split('::')
        org_versions[org] = ver
    ret_vers = []
    for organism in organisms:
        if organism in org_versions:
            old_ver = org_versions[organism]
        else:
            old_ver = 'no version'
        available, new_version = is_newer_available(old_ver, organism)
        if available:
            dfs.append(download_uniprot_chunks(progress=True, organism=organism, fields = db_fields, reviewed_only=True))
        ret_vers.append(f'{organism}::{new_version}')
    return pd.concat(dfs), ret_vers





[docs]
def download_uniprot_chunks(progress: bool = False, organism: int = -1,
                            fields: list|None = None, reviewed_only: bool = True) -> pd.DataFrame:
    """
    Downloads whole uniprot for a given organism using pagination.

    Entry -column will always be the first column and used as the index in the output dataframe.

    :param progress: Progress report printing
    :param organism: human by default, otherwise specify organism ID (e.g. human is 9606). if -1, all organisms will be downloaded
    :param fields: uniprot field labels for fields to retrieve. Refer to \
        https://www.uniprot.org/help/return_fields for help with field Labels \
        (from the label column). If None, download a default selection.
    :param reviewed_only: if True, only reviewed entries will be returned. if False, all entries will be returned (probably not what you want).
    
    :returns: the requested uniprot data in a pandas dataframe.
    """
    if organism > 0:
        taxonomy = f'%28taxonomy_id%3A{organism}'
        if reviewed_only:
            taxonomy += '%29%20AND%20'
    else:
        taxonomy = ''
    if reviewed_only:
        reviewed:str = '%28reviewed%3Atrue%29'
    else:
        reviewed = ''
    filter_str = f'{taxonomy}{reviewed}'
    if (organism > 0) or reviewed_only:
        filter_str = f'query={filter_str}&'
    fieldstr: list = []
    headers: list = []
    output_format: str = 'tsv'
    if not fields:
        field_dict: dict = get_default_uniprot_column_map()
        fields = [
            'Entry', 'Entry version', 'Reactome', 'Entry name', 'Gene names',
            'Protein names', 'Reviewed', 'Organism', 'Comment Count', 'Keywords',
            'Modified residue', 'PubMed ID', 'Post-translational modification',
            'Gene ontology (cellular component)', 'Gene ontology (molecular function)',
            'Gene ontology (biological process)', 'Length', 'Mass', 'Sequence'
        ]
        for field in fields:
            fieldstr.append(field_dict[field])
            headers.append(field)
        have: set = set(headers)
        for name, field in field_dict.items():
            if name not in have:
                fieldstr.append(field)
                headers.append(name)
    else:
        field_dict = get_uniprot_column_map()
        fieldstr.append(field_dict['Entry'])
        headers.append('Entry')
        for field in fields:
            if field == 'Entry':
                continue
            if field not in field_dict:
                warnings.warn((
                    f'Field {field} not found in UniProt. Refer to '
                    f'https://www.uniprot.org/help/return_fields for help with field Labels.'
                ), UserWarning)
            else:
                fieldstr.append(field_dict[field])
                headers.append(field)
    assert len(fieldstr) > 0, (
        'Empty UniProt label list. Refer to '
        'https://www.uniprot.org/help/return_fields for help with field Labels.'
    )
    final_fieldstr: str = '%2C'.join(fieldstr)
    pagination_url: str = (
        f'https://rest.uniprot.org/uniprotkb/search?fields={final_fieldstr}&format={output_format}&'
        f'{filter_str}size=500'
    )
    return download_uniprot_pagination_url(pagination_url, headers, progress)


def __format_seconds_into_estimate(total_seconds: int, append:str = ''):
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"estimated: {hours}:{minutes:02}:{seconds:02}{append}"


[docs]
def download_uniprot_pagination_url(pag_url: str, headers: list, progress:bool) -> pd.DataFrame:
    """
    Handles the downloading of the uniprot

    :param pag_url: url for uniprot
    :param headers: list of headers to get
    :param progress: True, if progress should be printed

    :returns: Pandas dataframe of the uniprot specified by url and headers.
    """
    retries: Retry = Retry(total=5, backoff_factor=0.25,
                    status_forcelist=[500, 502, 503, 504])
    session: requests.Session = requests.Session()
    session.mount("https://", HTTPAdapter(max_retries=retries))

    alltext: list = []
    index: list = []
    times_taken: list = []
    prev_time = datetime.now()
    for batch, total in __get_uniprot_batch(pag_url, session):
        for line in batch.text.splitlines()[1:]:
            line: list = line.split('\t')
            alltext.append(line[1:])
            index.append(line[0])
        if progress:
            time_taken = int((datetime.now() - prev_time).total_seconds())
            if time_taken > 0:
                times_taken.append(time_taken)
                est_time = int(((int(total)-len(alltext))/500)*(sum(times_taken[-5:])/len(times_taken[-5:])))
                estimate = __format_seconds_into_estimate(est_time, append = ' to finish')
            else:
                estimate = ''
            print(f'{len(alltext)} / {total}, took {time_taken} seconds, {estimate}')
            prev_time = datetime.now()
    return pd.DataFrame(columns=headers[1:], data=alltext, index=pd.Series(index, name='Entry'))



[docs]
def retrieve_protein_group(name:str, query_col:str = 'protein_name', reviewed:bool = True) -> pd.DataFrame:
    """
    Utility function to quickly download tsvs describing each common protein class used in ProteoGyver.
    
    :param name: protein group name to search for
    :param query_col: column to search. Protein name by default.
    
    :returns: pandas datafrmae of the given protein group
    """
    headers = [
        'Entry',
        'Reviewed',
        'Entry name',
        'Protein names',
        'Gene names',
        'Organism',
        'Length',
        'Gene names (primary)'
    ]
    colmap = get_uniprot_column_map()
    field_str = '%2C'.join([colmap[h] for h in headers])
    
    name = name.lower().replace(' ','+')
    if reviewed:
        revstr:str = '+AND+%28reviewed%3Atrue%29'
    else:
        revstr = ''
    group_url = f'https://rest.uniprot.org/uniprotkb/search?fields={field_str}&format=tsv&query=%28{query_col}%3A{name}%29{revstr}&size=500'
    return download_uniprot_pagination_url(group_url, headers, False)



[docs]
def retrieve_uniprot(uniprotfile: str = 'Full human uniprot.tsv', **kwargs) -> pd.DataFrame:
    """
    Downloads full uniprot (reviewed entries only) to a file and returns the dataframe.

    :param uniprotfile: path of the output file
    :param kwargs: kwargs to pass down to download_full_uniprot_for_organism, e.g. to specify which\
        organism or if progress should be reported.
    
    :returns: full uniprot as a pandas dataframe.
    """
    if os.path.isfile(uniprotfile):
        updf:pd.DataFrame = pd.read_csv(uniprotfile, sep='\t')
    else:
        updf = download_full_uniprot_for_organism(**kwargs)
        updf.to_csv(uniprotfile, sep='\t',encoding = 'utf-8')
    updf.index = updf['Entry']
    updf = updf.drop(columns=['Entry'])
    return updf



[docs]
def download_full_uniprot_for_organism(organism: int = None, # TODO: mark this as Union[list, int] when newer version of python available
                                       columns: set = None, progress: bool = False,
                                       overall_progress: bool = False, reviewed_only:bool = True) -> pd.DataFrame:
    """
    Downloads the full uniprot database EXCLUDING isoforms in a .tsv format for a \
        given organism.

    :param organism: integer ID or a list of integer ID of the desired organism, e.g. human is 9606. If none, defaults to human.
    :param columns: a set of uniprot columns to get
    :param progress: Print progress reports of how each batch download is going
    :param overall_progress: Print progress reports when each batch is finished
    :param reviewed only: True, if only reviewed entries should be retrieved

    :returns: pandas dataframe of the uniprot.
    """
    if not organism:
        organism: list = [9606]
    elif isinstance(organism, int):
        organism: list = [organism]
    dfs_to_merge: list = []
    if not columns:
        columns: set = set(get_default_uniprot_column_map().keys())
    need: set = columns
    lim: int = 40
    next_batch: list = ['Entry']
    while len(need) > 0:
        if overall_progress:
            print(f'{len(need)} columns to go in {1+int(len(need)/lim)} batches.')
        next_batch.extend(list(need)[:(lim-1)])
        for organism_id in organism:
            dfs_to_merge.append(download_uniprot_chunks(fields=next_batch,
                                                        organism=organism_id, progress=progress, reviewed_only=reviewed_only))
        need = {k: v for k, v in need.items() if k not in next_batch}
        next_batch = ['Entry']
    return pd.concat(dfs_to_merge, axis=1)



[docs]
def update(version: str, organism = 9606,progress=False) -> None:
    """
    Checks, whether an update for uniprot is available and downloads it if necessary.

    :param organism: which organism to download
    :param progress: True, if progress should be printed.
    """
    outdir: str = apitools.get_save_location('Uniprot')
    if is_newer_available(version, organism)[0]:
        today: str = apitools.get_timestamp()
        df: pd.DataFrame = download_full_uniprot_for_organism(organism=organism,progress=progress,overall_progress=progress)
        outfile: str = os.path.join(outdir, f'{today}_Uniprot_{organism}')
        df.to_csv(f'{outfile}.tsv',encoding = 'utf-8',sep='\t')
        name_dict:dict = {}
        length_dict:dict = {}
        for _,row in df.iterrows():
            name_dict[row['Entry']] = row['Gene names (primary)']
            length_dict[row['Entry']] = row['Length']
        with open(f'{outfile}_protein_names.json','w',encoding='utf-8') as fil:
            json.dump(name_dict,fil)
        with open(f'{outfile}_protein_lengths.json','w',encoding='utf-8') as fil:
            json.dump(length_dict,fil)




[docs]
def is_newer_available(current_version: str, organism: int = 9606) -> tuple[bool, str]:
    """
    Checks whether newer uniprot version is available

    :param current_version: Current version of the uniprot database
    :param organism: which organism uniprot to check.

    :returns: True, if newer uniprot version is available.
    """
    uniprot_url: str = f"https://rest.uniprot.org/uniprotkb/search?\
        query=organism_id:{organism}&format=fasta"
    uniprot_response: requests.Response = requests.get(uniprot_url)
    newest_version:str = uniprot_response.headers['X-UniProt-Release'].replace('_','-')
    if current_version != newest_version:
        return (True, newest_version)
    else:
        return (False, current_version)



[docs]
def methods_text(organism=9606) -> tuple:
    """
    Generates a methods text for used uniprot data
    
    :returns: a tuple of (readable reference information (str), PMID (str), uniprot description (str))
    """
    short:str
    long: str
    pmid: str
    short,long,pmid = apitools.get_pub_ref('uniprot')
    return '\n'.join([
        f'Protein annotations were mapped from UniProt (https://uniprot.org) {short}',
        pmid,
        long
    ])