# CellMarker 2.0
The underlying curation process for `bionty.CellMarker.df`.
import pandas as pd
import numpy as np
# Location of the CellMarker spreadsheet (all species) that preprocess() reads with pandas.
url = "http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download_files/file/Cell_marker_All.xlsx"
def preprocess(url: str, species: str):
    """Build the curated marker lookup table for a single species.

    Reads the spreadsheet at ``url`` with every column as ``str``, keeps the
    rows whose ``species`` column equals ``species.capitalize()``, reduces the
    table to one row per marker name, renames the columns, prepends a
    ``CM_``-prefixed ``id`` column, and finally folds alias spellings into a
    ``synonyms`` column via ``group_alias_rows`` and ``agg_synonyms``.

    Intermediate shapes are printed and the head of each intermediate table is
    shown with ``display`` (expected to be provided by the notebook runtime).

    Args:
        url: Path or URL of the Excel file to read.
        species: Species to keep; compared after ``str.capitalize()``.

    Returns:
        The curated DataFrame with a fresh 0..n-1 index.
    """
    kept_columns = ["marker", "GeneID", "Symbol", "Genename", "UNIPROTID"]
    new_column_names = {
        "marker": "name",
        "GeneID": "ncbi_gene_id",
        "Symbol": "gene_symbol",
        "Genename": "gene_name",
        "UNIPROTID": "uniprotkb_id",
    }

    raw = pd.read_excel(url, dtype=str)
    species_rows = raw[raw["species"] == species.capitalize()].copy()
    print(f"Original shape: {species_rows.shape}")
    display(species_rows.head())

    # Drop tissue, cell type, tech, journal; then keep one row per marker.
    markers = (
        species_rows[kept_columns]
        .copy()
        .drop_duplicates(subset=["marker", "GeneID", "UNIPROTID"])
        .drop_duplicates(subset=["marker"])
    )
    display(markers.head())

    markers = markers.rename(columns=new_column_names)

    # The id is built before the null filter; rows without a name are
    # discarded right after, so no "CM_nan" id survives.
    markers.insert(0, "id", "CM_" + markers["name"].astype(str))
    markers = markers[markers["name"].notnull()].copy()

    # Clean up index.
    markers = markers.reset_index(drop=True)
    print(f"Unique shape: {markers.shape}")
    display(markers.head())

    # Group synonyms: agg_synonyms appends one row per alias group in place.
    alias_groups, curated = group_alias_rows(markers)
    agg_synonyms(alias_groups, curated)

    # Clean up index again.
    curated = curated.reset_index(drop=True)
    print(f"After synonyms aggregation: {curated.shape}")
    display(curated.head())
    return curated
def group_alias_rows(df):
    """Split marker rows into alias groups and the remaining ungrouped rows.

    A name is used as a group seed when it contains ``-`` or one of the
    spelled-out Greek letters ``alpha``/``beta``/``gamma`` and does not already
    contain ``α``/``β``/``γ``. For each seed, spelling variants are generated
    (Greek-letter substitution, upper-case, ``str.capitalize()``, each with
    hyphens and/or spaces removed). If more than one of those variants is
    still an ungrouped name in ``df``, all matching rows form a group.

    Args:
        df: DataFrame with a ``name`` column of strings (no missing values;
            a non-string name would fail on the substring checks).

    Returns:
        A tuple ``(grouped_dfs, df_clean)``: ``grouped_dfs`` maps the seed name
        (after Greek-letter substitution) to a copy of the rows in that group;
        ``df_clean`` is a copy of the rows whose name belongs to no group.
    """
    greek_letters = ("α", "β", "γ")
    spelled_to_greek = (("alpha", "α"), ("beta", "β"), ("gamma", "γ"))

    allnames = df["name"].unique()
    restnames = set(allnames)
    grouped_dfs = {}

    for name in allnames:
        # Names already written with Greek letters never seed a group; they
        # can still be pulled into a group seeded by a spelled-out variant.
        if any(letter in name for letter in greek_letters):
            continue
        has_spelled_greek = any(word in name for word, _ in spelled_to_greek)
        if "-" not in name and not has_spelled_greek:
            continue

        names = {name}
        canonical = name
        if has_spelled_greek:
            for word, letter in spelled_to_greek:
                canonical = canonical.replace(word, letter)

        for base in (canonical, canonical.upper(), canonical.capitalize()):
            no_hyphen = base.replace("-", "")
            names.update(
                (
                    base,
                    no_hyphen,
                    base.replace(" ", ""),
                    no_hyphen.replace(" ", ""),
                )
            )

        # Only names not yet claimed by an earlier group are eligible.
        ints_names = names.intersection(restnames)
        if len(ints_names) > 1:
            grouped_dfs[canonical] = df[df["name"].isin(ints_names)].copy()
            # Remove by set difference rather than once per row, so a name
            # that occurs on several rows does not raise KeyError.
            restnames.difference_update(ints_names)

    df_clean = df[df["name"].isin(restnames)].copy()
    return grouped_dfs, df_clean
def agg_synonyms(grouped_dfs, df_clean):
    """Append one representative row per alias group to ``df_clean`` in place.

    A ``synonyms`` column (initialised to NaN) is added to ``df_clean``. For
    each group, the first row with a non-missing ``ncbi_gene_id`` is chosen as
    the representative (or the first row if none has one); the other distinct
    names in the group are joined with ``|`` and stored in ``synonyms``.

    Args:
        grouped_dfs: Mapping of group key to a DataFrame with at least the
            ``name`` and ``ncbi_gene_id`` columns.
        df_clean: DataFrame to extend; modified in place. Its index is assumed
            to hold integer labels, since new rows are labelled
            ``last label + 1``.

    Returns:
        None. ``df_clean`` is mutated.
    """
    df_clean["synonyms"] = np.nan
    for group in grouped_dfs.values():
        with_gene_id = group[group["ncbi_gene_id"].notna()]
        source = with_gene_id if len(with_gene_id) > 0 else group
        # Copy so that adding the "synonyms" entry never touches the group
        # frame (and does not trigger a chained-assignment warning).
        append_row = source.iloc[0].copy()

        index_name = append_row["name"]
        synonyms = group.loc[group["name"] != index_name, "name"].unique()
        # A group without any alias keeps NaN, like the ungrouped rows.
        append_row["synonyms"] = "|".join(synonyms) if len(synonyms) else np.nan

        # Start at 0 when every name was grouped and df_clean is still empty.
        next_label = df_clean.index[-1] + 1 if len(df_clean.index) else 0
        df_clean.loc[next_label] = append_row
## Human
# Curate the rows whose species is "Human" (preprocess capitalizes the argument).
df_human = preprocess(url, species="human")
Original shape: (60877, 20)
species | tissue_class | tissue_type | uberonongology_id | cancer_type | cell_type | cell_name | cellontology_id | marker | Symbol | GeneID | Genetype | Genename | UNIPROTID | technology_seq | marker_source | PMID | Title | journal | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | MERTK | MERTK | 10461 | protein_coding | MER proto-oncogene, tyrosine kinase | Q12866 | NaN | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
1 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CD16 | FCGR3A | 2215 | protein_coding | Fc fragment of IgG receptor IIIb | O75015 | NaN | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
2 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CD206 | MRC1 | 4360 | protein_coding | mannose receptor C-type 1 | P22897 | NaN | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
3 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CRIg | VSIG4 | 11326 | protein_coding | V-set and immunoglobulin domain containing 4 | Q9Y279 | NaN | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
4 | Human | Abdomen | Abdomen | UBERON_0000916 | Normal | Normal cell | Macrophage | CL_0000235 | CD163 | CD163 | 9332 | protein_coding | CD163 molecule | Q86VB7 | NaN | Experiment | 31982413 | Peritoneal Level of CD206 Associates With Mort... | Gastroenterology | 2020 |
marker | GeneID | Symbol | Genename | UNIPROTID | |
---|---|---|---|---|---|
0 | MERTK | 10461 | MERTK | MER proto-oncogene, tyrosine kinase | Q12866 |
1 | CD16 | 2215 | FCGR3A | Fc fragment of IgG receptor IIIb | O75015 |
2 | CD206 | 4360 | MRC1 | mannose receptor C-type 1 | P22897 |
3 | CRIg | 11326 | VSIG4 | V-set and immunoglobulin domain containing 4 | Q9Y279 |
4 | CD163 | 9332 | CD163 | CD163 molecule | Q86VB7 |
Unique shape: (16679, 6)
id | name | ncbi_gene_id | gene_symbol | gene_name | uniprotkb_id | |
---|---|---|---|---|---|---|
0 | CM_MERTK | MERTK | 10461 | MERTK | MER proto-oncogene, tyrosine kinase | Q12866 |
1 | CM_CD16 | CD16 | 2215 | FCGR3A | Fc fragment of IgG receptor IIIb | O75015 |
2 | CM_CD206 | CD206 | 4360 | MRC1 | mannose receptor C-type 1 | P22897 |
3 | CM_CRIg | CRIg | 11326 | VSIG4 | V-set and immunoglobulin domain containing 4 | Q9Y279 |
4 | CM_CD163 | CD163 | 9332 | CD163 | CD163 molecule | Q86VB7 |
After synonyms aggregation: (16452, 7)
id | name | ncbi_gene_id | gene_symbol | gene_name | uniprotkb_id | synonyms | |
---|---|---|---|---|---|---|---|
0 | CM_MERTK | MERTK | 10461 | MERTK | MER proto-oncogene, tyrosine kinase | Q12866 | NaN |
1 | CM_CD16 | CD16 | 2215 | FCGR3A | Fc fragment of IgG receptor IIIb | O75015 | NaN |
2 | CM_CD206 | CD206 | 4360 | MRC1 | mannose receptor C-type 1 | P22897 | NaN |
3 | CM_CRIg | CRIg | 11326 | VSIG4 | V-set and immunoglobulin domain containing 4 | Q9Y279 | NaN |
4 | CM_CD163 | CD163 | 9332 | CD163 | CD163 molecule | Q86VB7 | NaN |
# Write the curated human table to a parquet file in the working directory.
df_human.to_parquet("human_cellmarker_2.0_CellMarker_lookup.parquet")
## Mouse
# Curate the rows whose species is "Mouse" (preprocess capitalizes the argument).
df_mouse = preprocess(url, species="mouse")
Original shape: (35197, 20)
species | tissue_class | tissue_type | uberonongology_id | cancer_type | cell_type | cell_name | cellontology_id | marker | Symbol | GeneID | Genetype | Genename | UNIPROTID | technology_seq | marker_source | PMID | Title | journal | year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Fibro-adipogenic progenitor cell | NaN | Wisp1 | Ccn4 | 22402 | protein_coding | cellular communication network factor 4 | O54775 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
9 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Myoblast | CL_0000056 | Myod1 | Myod1 | 17927 | protein_coding | myogenic differentiation 1 | P10085 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
10 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Muscle satellite cell | CL_0000514 | Myf5 | Myf5 | 17877 | protein_coding | myogenic factor 5 | A2RSK4 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
11 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Myocyte | CL_0000187 | Ckm | Ckm | 12715 | protein_coding | creatine kinase, muscle | A2RTA0 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
12 | Mouse | Abdomen | Muscle | UBERON_0001630 | Normal | Normal cell | Myocyte | CL_0000187 | Acta1 | Acta1 | 11459 | protein_coding | actin alpha 1, skeletal muscle | P68134 | 10x Chromium | Experiment | 35439171 | An estrogen-sensitive fibroblast population dr... | JCI insight | 2022 |
marker | GeneID | Symbol | Genename | UNIPROTID | |
---|---|---|---|---|---|
8 | Wisp1 | 22402 | Ccn4 | cellular communication network factor 4 | O54775 |
9 | Myod1 | 17927 | Myod1 | myogenic differentiation 1 | P10085 |
10 | Myf5 | 17877 | Myf5 | myogenic factor 5 | A2RSK4 |
11 | Ckm | 12715 | Ckm | creatine kinase, muscle | A2RTA0 |
12 | Acta1 | 11459 | Acta1 | actin alpha 1, skeletal muscle | P68134 |
Unique shape: (12503, 6)
id | name | ncbi_gene_id | gene_symbol | gene_name | uniprotkb_id | |
---|---|---|---|---|---|---|
0 | CM_Wisp1 | Wisp1 | 22402 | Ccn4 | cellular communication network factor 4 | O54775 |
1 | CM_Myod1 | Myod1 | 17927 | Myod1 | myogenic differentiation 1 | P10085 |
2 | CM_Myf5 | Myf5 | 17877 | Myf5 | myogenic factor 5 | A2RSK4 |
3 | CM_Ckm | Ckm | 12715 | Ckm | creatine kinase, muscle | A2RTA0 |
4 | CM_Acta1 | Acta1 | 11459 | Acta1 | actin alpha 1, skeletal muscle | P68134 |
After synonyms aggregation: (12323, 7)
id | name | ncbi_gene_id | gene_symbol | gene_name | uniprotkb_id | synonyms | |
---|---|---|---|---|---|---|---|
0 | CM_Wisp1 | Wisp1 | 22402 | Ccn4 | cellular communication network factor 4 | O54775 | NaN |
1 | CM_Myod1 | Myod1 | 17927 | Myod1 | myogenic differentiation 1 | P10085 | NaN |
2 | CM_Myf5 | Myf5 | 17877 | Myf5 | myogenic factor 5 | A2RSK4 | NaN |
3 | CM_Ckm | Ckm | 12715 | Ckm | creatine kinase, muscle | A2RTA0 | NaN |
4 | CM_Acta1 | Acta1 | 11459 | Acta1 | actin alpha 1, skeletal muscle | P68134 | NaN |
# Write the curated mouse table to a parquet file in the working directory.
df_mouse.to_parquet("mouse_cellmarker_2.0_CellMarker_lookup.parquet")