CellMarker 2.0

The underlying curation process for `bionty.CellMarker.df`.

import pandas as pd
import numpy as np
# Download location of the complete CellMarker 2.0 marker table (an Excel
# workbook). NOTE(review): served over plain HTTP, not HTTPS.
url = "http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download_files/file/Cell_marker_All.xlsx"
def preprocess(url: str, species: str):
    """Curate the marker lookup table for a single species.

    Steps, in order:

    1. Read the Excel sheet at ``url`` with every column as ``str`` and keep
       the rows whose ``species`` column equals ``species.capitalize()``.
    2. Keep only the marker / gene columns and retain one row per marker.
    3. Rename the columns, prepend an ``id`` column (``"CM_" + name``) and
       drop rows without a name.
    4. Group spelling variants of the same marker and fold them into a
       ``synonyms`` column via ``group_alias_rows`` and ``agg_synonyms``.

    Shapes are printed and head previews are shown with ``display`` after
    each stage.

    Args:
        url: Location of the Excel workbook, passed to ``pd.read_excel``.
        species: Species to select; compared after ``str.capitalize()``.

    Returns:
        The curated DataFrame with a fresh 0..n-1 index.
    """
    raw_table = pd.read_excel(url, dtype=str)
    species_label = species.capitalize()
    df_cm = raw_table[raw_table["species"] == species_label].copy()
    print(f"Original shape: {df_cm.shape}")
    display(df_cm.head())

    # Drop tissue, cell type, tech, journal
    marker_columns = ["marker", "GeneID", "Symbol", "Genename", "UNIPROTID"]
    df = (
        df_cm[marker_columns]
        .drop_duplicates(subset=["marker", "GeneID", "UNIPROTID"])
        .drop_duplicates(subset=["marker"])
    )
    display(df.head())

    renamed_columns = {
        "marker": "name",
        "GeneID": "ncbi_gene_id",
        "Symbol": "gene_symbol",
        "Genename": "gene_name",
        "UNIPROTID": "uniprotkb_id",
    }
    df = df.rename(columns=renamed_columns)

    # insert an id column, then discard rows that have no marker name
    marker_ids = "CM_" + df["name"].astype(str)
    df.insert(0, "id", marker_ids)
    has_name = df["name"].notnull()
    df = df[has_name].copy()

    # clean up index
    df = df.reset_index(drop=True)
    print(f"Unique shape: {df.shape}")
    display(df.head())

    # group synonyms; agg_synonyms appends the merged rows to df_clean in place
    grouped_dfs, df_clean = group_alias_rows(df)
    agg_synonyms(grouped_dfs, df_clean)

    # clean up index again
    df_clean = df_clean.reset_index(drop=True)
    print(f"After synonyms aggregation: {df_clean.shape}")
    display(df_clean.head())

    return df_clean


def group_alias_rows(df):
    """Split ``df`` into groups of alias rows and the remaining rows.

    A name is used as a grouping seed when it contains ``-`` or one of the
    spelled-out letters ``alpha`` / ``beta`` / ``gamma``. Names that already
    contain ``α``, ``β`` or ``γ`` are never seeds, but they can still be
    pulled into a group started by another name. For a seed, the spelled-out
    letters are replaced by the Greek characters and spelling variants are
    generated: as-is, upper-cased and capitalized, each with hyphens and/or
    spaces removed. If more than one of the variants (including the original
    seed) is still an ungrouped name in ``df``, those rows form a group.

    Args:
        df: DataFrame with a string ``name`` column without missing values.

    Returns:
        A tuple ``(grouped_dfs, df_clean)``. ``grouped_dfs`` maps the seed
        name (after Greek-letter substitution) to a copy of the rows in its
        group. ``df_clean`` is a copy of the rows whose name did not end up
        in any group.
    """
    greek_letters = ("α", "β", "γ")
    spelled_to_greek = {"alpha": "α", "beta": "β", "gamma": "γ"}

    allnames = df["name"].unique()
    restnames = set(allnames)

    grouped_dfs = {}

    for name in allnames:
        if any(letter in name for letter in greek_letters):
            continue
        has_spelled_letter = any(word in name for word in spelled_to_greek)
        if "-" not in name and not has_spelled_letter:
            continue

        # Keep the original spelling as a candidate before substituting.
        names = {name}
        if has_spelled_letter:
            for word, letter in spelled_to_greek.items():
                name = name.replace(word, letter)

        for base in (name, name.upper(), name.capitalize()):
            no_hyphen = base.replace("-", "")
            names.update(
                (
                    base,
                    no_hyphen,
                    base.replace(" ", ""),
                    no_hyphen.replace(" ", ""),
                )
            )

        ints_names = names & restnames

        if len(ints_names) > 1:
            grouped_dfs[name] = df[df["name"].isin(ints_names)].copy()
            # Remove by set difference rather than row by row, so a name that
            # appears on several rows does not raise KeyError on its second
            # removal.
            restnames -= ints_names

    df_clean = df[df["name"].isin(restnames)].copy()

    return grouped_dfs, df_clean


def agg_synonyms(grouped_dfs, df_clean):
    """Collapse every alias group into one row and append it to ``df_clean``.

    ``df_clean`` is modified in place: a ``synonyms`` column is added
    (initialised to NaN) and one row per group is appended. The
    representative row of a group is its first row with a non-missing
    ``ncbi_gene_id``, or simply its first row when no row has one. The other
    distinct names in the group are stored in ``synonyms``, joined with
    ``|``.

    Args:
        grouped_dfs: Mapping whose values are DataFrames of alias rows, each
            with ``name`` and ``ncbi_gene_id`` columns.
        df_clean: DataFrame to extend. Appended rows get the label
            ``last label + 1`` (or ``0`` when ``df_clean`` is empty), so a
            non-empty ``df_clean`` needs a numeric index.

    Returns:
        None.
    """
    df_clean["synonyms"] = np.nan

    for df in grouped_dfs.values():
        df_not_na = df[df["ncbi_gene_id"].notna()]
        source = df_not_na if len(df_not_na) > 0 else df
        # Work on a copy: adding the "synonyms" entry must not write through
        # to, or warn about, the group frame the row was taken from.
        append_row = source.iloc[0].copy()

        index_name = append_row["name"]
        synonyms = df.loc[df["name"] != index_name, "name"].unique()
        # A single synonym joins to itself; a group with no alternative name
        # keeps the NaN placeholder instead of failing.
        append_row["synonyms"] = (
            "|".join(synonyms) if len(synonyms) > 0 else np.nan
        )

        # An empty df_clean (every name was grouped) has no last label.
        next_label = df_clean.index[-1] + 1 if len(df_clean) > 0 else 0
        df_clean.loc[next_label] = append_row

Human

# Curate the human rows of the CellMarker table; shapes and previews are
# printed along the way.
df_human = preprocess(url, species="human")
Original shape: (60877, 20)
species tissue_class tissue_type uberonongology_id cancer_type cell_type cell_name cellontology_id marker Symbol GeneID Genetype Genename UNIPROTID technology_seq marker_source PMID Title journal year
0 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 MERTK MERTK 10461 protein_coding MER proto-oncogene, tyrosine kinase Q12866 NaN Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
1 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 CD16 FCGR3A 2215 protein_coding Fc fragment of IgG receptor IIIb O75015 NaN Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
2 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 CD206 MRC1 4360 protein_coding mannose receptor C-type 1 P22897 NaN Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
3 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 CRIg VSIG4 11326 protein_coding V-set and immunoglobulin domain containing 4 Q9Y279 NaN Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
4 Human Abdomen Abdomen UBERON_0000916 Normal Normal cell Macrophage CL_0000235 CD163 CD163 9332 protein_coding CD163 molecule Q86VB7 NaN Experiment 31982413 Peritoneal Level of CD206 Associates With Mort... Gastroenterology 2020
marker GeneID Symbol Genename UNIPROTID
0 MERTK 10461 MERTK MER proto-oncogene, tyrosine kinase Q12866
1 CD16 2215 FCGR3A Fc fragment of IgG receptor IIIb O75015
2 CD206 4360 MRC1 mannose receptor C-type 1 P22897
3 CRIg 11326 VSIG4 V-set and immunoglobulin domain containing 4 Q9Y279
4 CD163 9332 CD163 CD163 molecule Q86VB7
Unique shape: (16679, 6)
id name ncbi_gene_id gene_symbol gene_name uniprotkb_id
0 CM_MERTK MERTK 10461 MERTK MER proto-oncogene, tyrosine kinase Q12866
1 CM_CD16 CD16 2215 FCGR3A Fc fragment of IgG receptor IIIb O75015
2 CM_CD206 CD206 4360 MRC1 mannose receptor C-type 1 P22897
3 CM_CRIg CRIg 11326 VSIG4 V-set and immunoglobulin domain containing 4 Q9Y279
4 CM_CD163 CD163 9332 CD163 CD163 molecule Q86VB7
After synonyms aggregation: (16452, 7)
id name ncbi_gene_id gene_symbol gene_name uniprotkb_id synonyms
0 CM_MERTK MERTK 10461 MERTK MER proto-oncogene, tyrosine kinase Q12866 NaN
1 CM_CD16 CD16 2215 FCGR3A Fc fragment of IgG receptor IIIb O75015 NaN
2 CM_CD206 CD206 4360 MRC1 mannose receptor C-type 1 P22897 NaN
3 CM_CRIg CRIg 11326 VSIG4 V-set and immunoglobulin domain containing 4 Q9Y279 NaN
4 CM_CD163 CD163 9332 CD163 CD163 molecule Q86VB7 NaN
# Write the curated human lookup table to a parquet file in the working directory.
df_human.to_parquet("human_cellmarker_2.0_CellMarker_lookup.parquet")

Mouse

# Curate the mouse rows of the CellMarker table; shapes and previews are
# printed along the way.
df_mouse = preprocess(url, species="mouse")
Original shape: (35197, 20)
species tissue_class tissue_type uberonongology_id cancer_type cell_type cell_name cellontology_id marker Symbol GeneID Genetype Genename UNIPROTID technology_seq marker_source PMID Title journal year
8 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Fibro-adipogenic progenitor cell NaN Wisp1 Ccn4 22402 protein_coding cellular communication network factor 4 O54775 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
9 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Myoblast CL_0000056 Myod1 Myod1 17927 protein_coding myogenic differentiation 1 P10085 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
10 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Muscle satellite cell CL_0000514 Myf5 Myf5 17877 protein_coding myogenic factor 5 A2RSK4 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
11 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Myocyte CL_0000187 Ckm Ckm 12715 protein_coding creatine kinase, muscle A2RTA0 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
12 Mouse Abdomen Muscle UBERON_0001630 Normal Normal cell Myocyte CL_0000187 Acta1 Acta1 11459 protein_coding actin alpha 1, skeletal muscle P68134 10x Chromium Experiment 35439171 An estrogen-sensitive fibroblast population dr... JCI insight 2022
marker GeneID Symbol Genename UNIPROTID
8 Wisp1 22402 Ccn4 cellular communication network factor 4 O54775
9 Myod1 17927 Myod1 myogenic differentiation 1 P10085
10 Myf5 17877 Myf5 myogenic factor 5 A2RSK4
11 Ckm 12715 Ckm creatine kinase, muscle A2RTA0
12 Acta1 11459 Acta1 actin alpha 1, skeletal muscle P68134
Unique shape: (12503, 6)
id name ncbi_gene_id gene_symbol gene_name uniprotkb_id
0 CM_Wisp1 Wisp1 22402 Ccn4 cellular communication network factor 4 O54775
1 CM_Myod1 Myod1 17927 Myod1 myogenic differentiation 1 P10085
2 CM_Myf5 Myf5 17877 Myf5 myogenic factor 5 A2RSK4
3 CM_Ckm Ckm 12715 Ckm creatine kinase, muscle A2RTA0
4 CM_Acta1 Acta1 11459 Acta1 actin alpha 1, skeletal muscle P68134
After synonyms aggregation: (12323, 7)
id name ncbi_gene_id gene_symbol gene_name uniprotkb_id synonyms
0 CM_Wisp1 Wisp1 22402 Ccn4 cellular communication network factor 4 O54775 NaN
1 CM_Myod1 Myod1 17927 Myod1 myogenic differentiation 1 P10085 NaN
2 CM_Myf5 Myf5 17877 Myf5 myogenic factor 5 A2RSK4 NaN
3 CM_Ckm Ckm 12715 Ckm creatine kinase, muscle A2RTA0 NaN
4 CM_Acta1 Acta1 11459 Acta1 actin alpha 1, skeletal muscle P68134 NaN
# Write the curated mouse lookup table to a parquet file in the working directory.
df_mouse.to_parquet("mouse_cellmarker_2.0_CellMarker_lookup.parquet")