Codes in paper

10X single-cell RNA sequencing QC (known sample SNP)

Introduction

In this paper, single-cell 10X RNA sequencing was performed on a mixture of a variety of known cell lines to study the interaction mode between polyclones. Here we introduce the operation of filtering doublets and low quality cells in single cell sequencing.

Code explanation

Load the required tool function

library(mclust)
library(ggplot2)

# load counts matrix, genes, and cell identities for a given experiment
load_sc_exp <- function(data_folder) {
  rMat <- Matrix::readMM(file.path(data_folder, 'matrix.mtx')) %>% as.matrix()
  genes <- readr::read_tsv(file.path(data_folder, 'genes.tsv'), col_names = F) %>% 
    magrittr::set_colnames(c('Ensembl_ID', 'Gene_Symbol'))
  barcodes <- readr::read_tsv(file.path(data_folder, 'barcodes.tsv'), col_names = F) 
  colnames(rMat) <- barcodes$X1
  rownames(rMat) <- genes$Ensembl_ID
  num_CLs <- ncol(rMat)
  num_genes <- nrow(rMat)
  classification <- barcodes
  if (file.exists(file.path(data_folder, 'classifications.csv'))) {
    classification <- readr::read_csv(file.path(data_folder, 'classifications.csv')) %>% as.data.frame()
  }
  cat(sprintf('Loaded matrix with %d CLs and %d genes\n', num_CLs, num_genes))

  return(list(counts = rMat, gene_info = genes, cell_info = classification))
}
# convert gene identifiers from ensembl IDs to hugo symbols
convert_to_hugo_symbols <- function (dat) {
  ensemble_to_hugo <- dat$gene_info$Gene_Symbol %>% magrittr::set_names(dat$gene_info$Ensembl_ID)
  rownames(dat$counts) %<>% ensemble_to_hugo[.] %>% magrittr::set_names(NULL)
  dat$counts <- dat$counts[!is.na(rownames(dat$counts)), ]
  dat$counts <- dat$counts[!duplicated(rownames(dat$counts)), ]
  return(dat)
}
# input data object containing the counts, cell info (cell line classifications), and gene info
# for the experiment
create_seurat_obj <- function(dat, use_symbols = TRUE) {
  if(use_symbols) {
    dat <- dat %>%
      convert_to_hugo_symbols() #convert genes to hugo symbols (and only keep unique hugo symbol genes)
  }
  rownames(dat$cell_info) <- dat$cell_info$barcode

  #make into Seurat object
  seuObj <- Seurat::CreateSeuratObject(dat$counts, 
                                       min.cells = 0,
                                       min.features = 0,
                                       meta.data = dat$cell_info)

  #make singlet classifications the cell identifiers
  seuObj <- Seurat::SetIdent(seuObj, value = seuObj@meta.data$singlet_ID)

  #store gene info here
  seuObj@misc <- dat$gene_info

  #add mitochondrial gene fraction
  seuObj<- Seurat::PercentageFeatureSet(seuObj, pattern = "^MT-", col.name = 'percent.mito')

  #add cellular detection rate (fraction of genes detected in a given cell)
  seuObj@meta.data$cell_det_rate <- seuObj$nFeature_RNA/nrow(seuObj@assays$RNA@counts)

  return(seuObj)
}
# begin adding cell quality classifications
# classify cells with too few SNPs or abnormal mitochondrial read percentages as low quality
filter_low_quality_cells <- function(seuObj, min_SNPs = 50, max_mito_perc = 25, min_mito_perc=1) {

  #too few SNPs
  seuObj@meta.data <- seuObj@meta.data %>%
    dplyr::mutate(cell_quality = ifelse(num_SNPs < min_SNPs, 'low_quality', 'normal'))

  #abnormal mitochondrial read %
  seuObj@meta.data <- seuObj@meta.data %>%
    dplyr::mutate(cell_quality = ifelse(percent.mito > max_mito_perc | percent.mito < min_mito_perc, 'low_quality', cell_quality))

  return(seuObj)
}
# run Seurat methods, excluding low quality cells
process_Seurat_obj <- function(seuObj, n_pcs = 50, cluster_k =10, clust_res = 1, plot = TRUE) {

  all_meta <- seuObj@meta.data
  rownames(all_meta) <- all_meta$barcode

  rownames(seuObj@meta.data) <- seuObj@meta.data$barcode

  seuObj <- Seurat::SubsetData(seuObj, cells = dplyr::filter(seuObj@meta.data, cell_quality == 'normal')$barcode)
  # run Seurat methods to get gene expression clusters
  seuObj <- Seurat::NormalizeData(object = seuObj, 
                                  normalization.method = "LogNormalize", 
                                  scale.factor = 1e5, verbose = FALSE)

  seuObj <- Seurat::ScaleData(object = seuObj, vars.to.regress = c(), verbose = FALSE)

  seuObj <- Seurat::FindVariableFeatures(object = seuObj,
                                         nfeatures = 5000,
                                         do.plot = FALSE,
                                         selection.method = 'vst', verbose = FALSE)

  seuObj <- Seurat::RunPCA(object = seuObj,
                           features = Seurat::VariableFeatures(seuObj),
                           seed.use = 1,
                           npcs = n_pcs,
                           verbose = FALSE)

  seuObj <- Seurat::RunTSNE(object = seuObj,
                            dims = 1:n_pcs,
                            check_duplicates = FALSE,
                            seed.use = 1,
                            perplexity = 30,
                            verbose = FALSE)  

  seuObj <- Seurat::FindNeighbors(seuObj, reduction = 'pca',
                                  dims = 1:n_pcs,
                                  k.param = cluster_k, 
                                  force.recalc = TRUE,
                                  verbose = FALSE)
  seuObj <- Seurat::FindClusters(seuObj, resolution = clust_res, random.seed = 1, verbose = FALSE)

  if(plot) {
    gg <- Seurat::DimPlot(seuObj, group.by = 'seurat_clusters') + Seurat::NoLegend()
    print(gg)
  }

  all_meta <- dplyr::left_join(all_meta, seuObj@meta.data[,c('barcode', 'RNA_snn_res.1','seurat_clusters')])
  seuObj@meta.data <- all_meta
  return(seuObj)

}
# identify low quality 'cells', which may be from empty droplets
empty_droplet_clusters <- function(seuObj, dev_thresh = 0.3, plot = TRUE) {
  df <- Seurat::Embeddings(seuObj, reduction = 'tsne') %>% as.data.frame() %>%
    tibble::rownames_to_column(var = 'barcode') %>%
    dplyr::left_join(seuObj@meta.data %>% 
                       dplyr::mutate(max_dev = singlet_dev + doublet_dev_imp), by='barcode')

  #compute stats for each cluster
  clust_stats <- plyr::ddply(df, .(seurat_clusters), function(ss) {
    data_frame(n_cells = nrow(ss),
               med_doub = median(ss$doublet_dev_imp, na.rm=T),
               med_dev = median(ss$max_dev, na.rm=T))
  }) %>% dplyr::mutate(
    clust_type = ifelse(med_dev <= dev_thresh, 'empty_droplet', 'normal')
  )

  empty_droplets <- df$barcode[which((clust_stats$clust_type %>% rlang::set_names(clust_stats$seurat_clusters) %>%
                                        .[df$seurat_clusters]) == 'empty_droplet')]
  print(sprintf('Identified %d empty droplets', length(empty_droplets)))

  rownames(seuObj@meta.data) <- seuObj@meta.data$barcode
  seuObj@meta.data[empty_droplets,'cell_quality'] <-  'empty_droplet'

  if(plot) {
    gg <- Seurat::DimPlot(seuObj, group.by = 'cell_quality')
    print(gg)
  }
  return(seuObj)
}
# Identify doublets
classify_doublets <- function(seuObj, doub_prob_thresh=0.5, doublet_dev_offset = 0.001, plot = TRUE) {
  # use singlet deviance, doublet deviance improvement, and fraction of genes detected in a given cell
  # to identify cells that are doublets
  # doublet identification

  doublet_df <- Seurat::FetchData(seuObj, c('singlet_dev', 'doublet_dev_imp', 'cell_quality')) %>% 
    tibble::rownames_to_column(var = 'barcode') %>% 
    dplyr::filter(cell_quality %in% c('normal', 'doublet')) %>% 
    dplyr::mutate(log_doublet_imp = log10(doublet_dev_imp + doublet_dev_offset)) %>% 
    dplyr::select(-doublet_dev_imp) 

  # handle cells where the doublet model failed by treating them as singlets.
  fail_doublet_model <- doublet_df %>% 
    dplyr::filter(is.na(log_doublet_imp)) %>% 
    .[['barcode']]
  print(sprintf('%d cells with failed doublet models treated as singlets', length(fail_doublet_model)))

  doublet_df %<>% dplyr::filter(!(barcode %in% fail_doublet_model))

  # sfit GMM
  gmm_output <- mclust::Mclust(as.matrix(doublet_df[, c('log_doublet_imp', 'singlet_dev')]), G = 2, verbose = FALSE, prior = mclust::priorControl(shrinkage = 0))

  doublet_cluster <- which.max(gmm_output$parameters$mean['log_doublet_imp', ])
  singlet_cluster <- which.min(gmm_output$parameters$mean['log_doublet_imp', ])
  doublet_df$doublet_prob <- gmm_output$z[,doublet_cluster]
  doublet_cells <- doublet_df %>% 
    dplyr::filter(doublet_prob >= doub_prob_thresh) %>% 
    .[['barcode']]

  seuObj@meta.data[doublet_cells, 'cell_quality'] <- 'doublet'

  if(plot) {
    g1 <- plot(gmm_output, what = 'classification')

    g2 <- ggplot2::ggplot(doublet_df, ggplot2::aes(singlet_dev, log_doublet_imp, color=doublet_prob)) + 
      ggplot2::geom_point() + 
      ggplot2::theme(legend.position = "right", legend.direction = "vertical", legend.text = ggplot2::element_text(size=10)) +
      ggplot2::xlab("singlet deviance") + ggplot2::ylab("log doublet deviance improvement")

    g3 <- ggplot2::ggplot(doublet_df %>% dplyr::mutate(is_doublet = barcode %in% doublet_cells), ggplot2::aes(singlet_dev, log_doublet_imp, color= is_doublet)) +
      ggplot2::geom_point() +
      ggplot2::theme(legend.position = "right", legend.direction = "vertical", legend.text =ggplot2::element_text(size=10)) +
      ggplot2::xlab("singlet deviance") + ggplot2::ylab("log doublet deviance improvement") +
      ggplot2::guides(color = ggplot2::guide_legend(title = 'doublet'))

    print(g1)
    print(g2)
    print(g3)
  }
  return(seuObj)
}

identify_low_confidence_cells <- function(seuObj, min_z_margin = 2, plot = TRUE) {
  df <- seuObj@meta.data

  seuObj@meta.data %<>% 
    dplyr::mutate(cell_quality = ifelse(seuObj@meta.data$singlet_z_margin < min_z_margin & seuObj@meta.data$cell_quality == 'normal', 'low_confidence', cell_quality))

  if(plot) {
    cell_class_confidence <- ggplot2::ggplot(df %>% filter(cell_quality == 'normal'),
                                             ggplot2::aes(singlet_z_margin)) + 
      ggplot2::geom_density() + 
      ggplot2::geom_vline(xintercept = min_z_margin, linetype = 'dashed')

    print(cell_class_confidence)
    gg <- ggplot2::ggplot(seuObj@meta.data, ggplot2::aes(singlet_dev, doublet_dev_imp, fill=cell_quality)) + 
      ggplot2::geom_point(pch = 21, size = 1.5, color = 'white', stroke = 0.1) + ggplot2::ggtitle('cell quality classifications')  
    print(gg)
  }

  return(seuObj)
}

To read in the data, the data is placed under a folder named data, not a single_cell_classification folder, which includes the output matrix.mtx, genes.tsv, barcodes.tsv of Cellranger, and a cell classification file identified by SNP. classifications.csv

  # run QC methods
  dat <- load_sc_exp(here::here('data'))
  seuObj <- create_seurat_obj(dat)
  num_CLs <- length(unique(seuObj@meta.data$singlet_ID))

Set the parameters according to the number of sequenced cell lines, n_pcs is the number of principal components in principal component analysis, and clust_res is the resolution parameter of running FindCluster function

  # set parameters based on the number of cell lines in the experiment
  n_pcs <- num_CLs*2
  param_range <- 10
  if(num_CLs < 25+param_range) {
    clust_res <- 1
  } else if(num_CLs > 50-param_range & num_CLs < 50+param_range) {
    clust_res <- 2
  } else if(num_CLs > 100-param_range & num_CLs < 100+param_range) {
    clust_res <- 4
  } else {
    stop('Not implemented')
  }

Filter operation


  seuObj <- filter_low_quality_cells(seuObj)
  seuObj <- process_Seurat_obj(seuObj, n_pcs = n_pcs,  clust_res = clust_res)
  seuObj <- empty_droplet_clusters(seuObj)
  seuObj <- classify_doublets(seuObj)
  seuObj <- identify_low_confidence_cells(seuObj)

References

Kinker G S , Greenwald A C , Tal R , et al. Pan-cancer single-cell RNA-seq identifies recurring programs of cellular heterogeneity[J]. Nature Genetics, 2020, 52(11):1208-1218.

Original

https://github.com/broadinstitute/single_cell_classification

Leave a Reply

Your email address will not be published. Required fields are marked *