TCGA数据库8个应用领域之3——UCSCXenaTools探索各类 ...

3

主题

7

帖子

12

积分

新手上路

Rank: 1

积分
12
发表于 2022-12-1 13:08:23 | 显示全部楼层
这次我们来探索各类肿瘤与对照的全局(如mRNA,lncRNA,miRNA,甲基化,蛋白)水平的差异情况(差异分析流程),也就是各类型肿瘤的turmorl和normal的差异分析,用的包是UCSCXenaTools,UCSCXenaTools包以往的介绍见:通过R包UCSCXenaTools链接UCSC的XENA浏览器来探索TCGA等公共数据、UCSCXenaTools介绍。
设置

rm(list=ls())options(stringsAsFactors = F)library(data.table)library(dplyr)library(stringi)#install.packages("UCSCXenaTools")library(UCSCXenaTools)通过标准分析流程五步走

生成 XenaHub 对象
#UCSCXenaTools 使用包内置数据集 XenaData 辅助生成 XenaHub 对象,这个数据集记录了当前所有数据集的信息。data(XenaData) head(XenaData)#> # A tibble: 6 x 17#>   XenaHosts XenaHostNames XenaCohorts XenaDatasets SampleCount DataSubtype#>   <chr>     <chr>         <chr>       <chr>              <int> <chr>      #> 1 https://… publicHub     Breast Can… ucsfNeve_pu…          51 gene expre…#> 2 https://… publicHub     Breast Can… ucsfNeve_pu…          57 phenotype  #> 3 https://… publicHub     Glioma (Ko… kotliarov20…         194 copy number#> 4 https://… publicHub     Glioma (Ko… kotliarov20…         194 phenotype  #> 5 https://… publicHub     Lung Cance… weir2007_pu…         383 copy number#> 6 https://… publicHub     Lung Cance… weir2007_pu…         383 phenotype  #> # … with 11 more variables: Label <chr>, Type <chr>,#> #   AnatomicalOrigin <chr>, SampleType <chr>, Tags <chr>, ProbeMap <chr>,#> #   LongTitle <chr>, Citation <chr>, Version <chr>, Unit <chr>,#> #   Platform <chr>#从'XenaData'生成XenaHub对象,取子集xe = XenaGenerate(subset = XenaHostNames == "gdcHub")过滤数据
xe2 = XenaFilter(xe, filterDatasets = "htseq_counts") xe2@datasets# [1] "TCGA-BLCA.htseq_counts.tsv"     "TCGA-LUSC.htseq_counts.tsv"    # [3] "TCGA-ESCA.htseq_counts.tsv"     "TARGET-RT.htseq_counts.tsv"    # [5] "MMRF-COMMPASS.htseq_counts.tsv" "TCGA-MESO.htseq_counts.tsv"    # [7] "TCGA-LUAD.htseq_counts.tsv"     "TCGA-STAD.htseq_counts.tsv"    # [9] "TCGA-TGCT.htseq_counts.tsv"     "TCGA-UVM.htseq_counts.tsv"     # [11] "TCGA-PAAD.htseq_counts.tsv"     "TCGA-GBM.htseq_counts.tsv"     # [13] "TCGA-COAD.htseq_counts.tsv"     "TARGET-NBL.htseq_counts.tsv"   # [15] "TCGA-OV.htseq_counts.tsv"       "TCGA-THYM.htseq_counts.tsv"    # [17] "TCGA-BRCA.htseq_counts.tsv"     "TCGA-UCEC.htseq_counts.tsv"    # [19] "TCGA-UCS.htseq_counts.tsv"      "TARGET-WT.htseq_counts.tsv"    # [21] "TARGET-AML.htseq_counts.tsv"    "TCGA-CHOL.htseq_counts.tsv"    # [23] "TARGET-OS.htseq_counts.tsv"     "TCGA-KIRC.htseq_counts.tsv"    # [25] "TCGA-ACC.htseq_counts.tsv"      "TCGA-LGG.htseq_counts.tsv"     # [27] "TCGA-KIRP.htseq_counts.tsv"     "TCGA-KICH.htseq_counts.tsv"    # [29] "TCGA-THCA.htseq_counts.tsv"     "TARGET-ALL-P3.htseq_counts.tsv"# [31] "TCGA-LIHC.htseq_counts.tsv"     "TCGA-HNSC.htseq_counts.tsv"    # [33] "TCGA-CESC.htseq_counts.tsv"     "TCGA-READ.htseq_counts.tsv"    # [35] "TCGA-PCPG.htseq_counts.tsv"     "TARGET-CCSK.htseq_counts.tsv"  # [37] "TCGA-SARC.htseq_counts.tsv"     "TCGA-DLBC.htseq_counts.tsv"    # [39] "TCGA-PRAD.htseq_counts.tsv"     "TCGA-LAML.htseq_counts.tsv"    # [41] "TCGA-SKCM.htseq_counts.tsv"检索数据
xe2_query = XenaQuery(xe2)head(xe2_query)#                      hosts                   datasets# 1 https://gdc.xenahubs.net TCGA-BLCA.htseq_counts.tsv# 2 https://gdc.xenahubs.net TCGA-LUSC.htseq_counts.tsv# 3 https://gdc.xenahubs.net TCGA-ESCA.htseq_counts.tsv# 6 https://gdc.xenahubs.net TCGA-MESO.htseq_counts.tsv# 7 https://gdc.xenahubs.net TCGA-LUAD.htseq_counts.tsv# 8 https://gdc.xenahubs.net TCGA-STAD.htseq_counts.tsv# url# 1 https://gdc.xenahubs.net/download/TCGA-BLCA.htseq_counts.tsv.gz# 2 https://gdc.xenahubs.net/download/TCGA-LUSC.htseq_counts.tsv.gz# 3 https://gdc.xenahubs.net/download/TCGA-ESCA.htseq_counts.tsv.gz# 6 https://gdc.xenahubs.net/download/TCGA-MESO.htseq_counts.tsv.gz# 7 https://gdc.xenahubs.net/download/TCGA-LUAD.htseq_counts.tsv.gz# 8 https://gdc.xenahubs.net/download/TCGA-STAD.htseq_counts.tsv.gzsave(xe2_query, file = "03-DEG/xe2_query.RData")下载和导入数据放在画图的循环里。
循环画图

load("03-DEG/xe2_query.RData")xe2_query = xe2_query[substr(xe2_query$datasets,1,4)== 'TCGA',]   #去掉其它数据库来源的数据,因为分组规则不一样(不一定按01-11分组),后续画图会出错destdir = "03-DEG/xena_TCGA_data" #下载数据存入的文件夹library(limma)for (i in 1:length(xe2_query$datasets) ) {  #i = 14  xe2_query2 = xe2_query[i,]    #下载数据,已经下载了会读入  xe2_download = XenaDownload(xe2_query2, destdir = destdir,                               trans_slash = TRUE)#导入数据   expr = XenaPrepare(xe2_download)   expr$Ensembl_ID=stri_sub(expr$Ensembl_ID,1,15)##保留前15位,去掉小数点  row.names(expr)=expr$Ensembl_ID  expr2 =expr[,-1]  row.names(expr2)=expr$Ensembl_ID  expr2=normalizeBetweenArrays(expr2)  #转换一下 id  library(tinyarray)  expr2 = trans_exp(expr2)  #tumor和normal分组信息  group = ifelse(as.numeric(substr(colnames(expr2),14,15)) < 10,'tumor','normal')  #group = ifelse(as.numeric(substr(colnames(expr2),nchar(colnames(expr2))-2,nchar(colnames(expr2))-1)) < 10,'tumor','normal')    group <- factor(group, levels = c('normal','tumor')) #因为有一些癌症数据集是没有normal样本的,所以要做一个判断,且使用normal样本大于15的数据 if (sum(group=="normal")>15) {     #if ("normal" %in% group) {     #转录组数据用limma做差异分析    library(limma)    dge <- edgeR::DGEList(counts=expr2)    dge <- edgeR::calcNormFactors(dge)    design=model.matrix(~factor( group ))    fit = voom(dge,design, normalize="quantile")    fit=lmFit(expr2,design)    fit=eBayes(fit)       options(digits = 4) #设置全局的数字有效位数为4    deg = topTable(fit,coef=2,adjust='BH', n=Inf)       #获取具体的癌症名字    cancer = xe2_query2$datasets    cancer = gsub("TCGA-","",cancer)    cancer = gsub(".htseq_counts.tsv","",cancer)    cancer    save(deg, file=paste0("03-DEG/",cancer,"_limma_deg.Rdata"))     #取上下调显著的前10个基因画图    logFC_t=1    P.Value_t = 0.05        k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)    k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)    deg <- mutate(deg,change = ifelse(k1,"down",ifelse(k2,"up","stable")))    library(dplyr)    dat2 = deg %>%      filter(change!="stable") %>%      arrange(logFC)  #把上下调基因按logFC排序     cg = c(head(rownames(dat2),5),           tail(rownames(dat2),5))    #火山图    library(EnhancedVolcano)    EnhancedVolcano(deg,                        lab =  rownames(deg),                        x = 'logFC',                        y = 'P.Value',                        pCutoff = 0.05,                        FCcutoff = 1,                        labSize = 5,                        selectLab = cg,                        drawConnectors = TRUE #EnhancedVolcano显示基因名会默认重叠就不显示,加上这个参数才会错开显示    )    ggsave(width = 10, height = 8, filename = paste0("03-DEG/",cancer,"_limma_deg.png"))     }}有14个癌症的normal数据大于15个画出了图,放三个看看:
COAD

STAD

THCA
回复

举报 使用道具

您需要登录后才可以回帖 登录 | 立即注册
快速回复 返回顶部 返回列表