00-03 TCGA差异分析
xena TCGA的counts数据做差异分析
语法
#### ####:R中的目录,大标题
##:标题
#:注释
#@:阶段性成果
#!:值得注意
#?:存疑
代码
##设置工作目录
setwd("xena")
##安装包
install.packages("tidyverse")
##使用包,每次重新打开R都要library一下 或者右下勾选
library(tidyverse)
#xena数据库官网 包括TCGA https://xenabrowser.net/datapages/
##文件的读取和处理
#读取tsv文件
counts1 = read.table(file = 'TCGA-LIHC.htseq_counts.tsv', sep = '\t', header = TRUE)
#为新表x赋值1到3列
#x <- counts1[,1:3]
#改行名
rownames(counts1) <- counts1[,1] #Alt按-可打出<-
#只去除第一列
counts1 = counts1[,-1]
#介绍substr函数,选取字符串的1到4位
#substr("wanglihong",1,4)
#table函数,分组计算
table(substr(colnames(counts1),14,16))
#集合c,含两个值
#c("01A","11A")
#%in%符号用于判断是否属于,筛选出属于的
counts1 <- counts1[,substr(colnames(counts1),14,16)%in% c("01A","11A")]
#再table一次,看是否筛选出来了
table(substr(colnames(counts1),14,16))
#保留行名前15位
rownames(counts1) <- substr(rownames(counts1),1,15)
#ceiling函数,取大于目标值的最小整数
#ceiling(1.2)
#对counts1作数据转换,还原到原数据(原网页显示表中数据是原数据转化而来)
counts <- ceiling(2^(counts1)-1)
##文件的输出
#输出为文本
write.table(counts,"counts.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#@输出为表格,到这一步由counts1得到原数据counts的表
write.csv(counts, file = "counts.csv")
##24.12.14 03 TCGA差异分析
#设置工作目录
setwd("xena")
#读取之前txt格式的counts
counts <- read.table("counts.txt",sep = "\t",row.names = 1,check.names = F,stringsAsFactors = F,header = T)
#加载基因注释文件
Ginfo_0 <- read.table("gene_length_Table.txt",sep = "\t",check.names = F,stringsAsFactors = F,header = T,row.names = 1)
#美元符号代表提取列,Ginfo_0
#注意两个等号
#!我发现27行其实可以更精确从列选取
#取行名交集,获得一组行名的值,命名为comgene
comgene <- intersect(rownames(counts),rownames(Ginfo))
#选出counts中这些行名的行,即编码RNA的基因的行
counts <- counts[comgene,]
#判断数据类型,如"data.frame"是数据框
#class(counts)
#class(comgene) "character"字符
#?up说这里为已经是19627行的Ginfo赋值是为了使行顺序和comgene一致,我认为确实能使顺序一致,但我怀疑在此处本来顺序就是一致的。因为取交集之前两者的行都是按顺序排列。
Ginfo <- Ginfo[comgene,]
#判断两者行名是否完全相同(包括顺序)
a <- rownames(counts)
b <- rownames(Ginfo)
identical(a,b)
#把Ginfo里的列作为字符给counts新增一列为Gene。新增Gene Symbol
counts
#?去掉Gene重复的行,为什么基因名字重复而基因id不重复呢,!应该表示非。当然,后面既然使用基因名作为行名,自然保险起见应该以基因名去重复
counts <- counts[!duplicated(counts
#计算有多少列
ncol(Ginfo)
#计算有多少行
nrow
#去除最后一列,列的总数即最后一列
counts <- counts[,-ncol(counts)]
#@输出筛选后的肝癌的编码RNA的基因的表达量的文件,到这一步counts加工了行名为genename
write.table(counts, file = "LIHC_counts_mRNA_all.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#保存counts中01A即癌症患者的列名,其实跟27行一样,但是只有一个值,不需要集合,从而可以用双等于也不需要用%in%
#但有一个很大的不同,27行是表格中的列赋值到另一个表,这里是列名赋值到一组字符串
#?为什么这里的格式是这个呢,方括号一般是表啊,难道表示判断也可以。哦,也许是一组字符串的第几个也可以用方括号。
tumor <- colnames(counts)[substr(colnames(counts),14,16) == "01A"]
counts_01A <- counts[,tumor]
write.table(counts_01A, file = "LIHC_counts_mRNA_01A.txt",sep = "\t",row.names = T,col.names = NA,quote = F)
#差异分析
library(tidyverse)
#安装BiocManager
library(BiocManager)
#通过BiocManager安装DESeq2,Update all/some/none? [a/s/n]: n 似乎跟tidyverse不一样只用来安装
ifinstall('DESeq2'
library(DESeq2)
#不知道
counts = counts[apply(counts, 1, function(x) sum(x > 1) > 32), ]
#分组,是tumor与normal相比
conditions=data.frame(sample=colnames(counts),
group=factor(ifelse(substr(colnames(counts),14,16) == "01A","T","N"),levels = c("N","T"))) %>%
column_to_rownames("sample")
#不知道
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = conditions,
design = ~ group)
#跑差异分析
dds <- DESeq(dds)
#"Intercept" "group_T_vs_N"
resultsNames(dds)
#提取结果res
res <- results(dds)
#一定要保存!保存之后以后可以直接用 DEG:differentially expressed genes 差异性表达的基因
#读取差异基因文件 直接在文件夹双击
save(res,file = "LIHC_DEG.rda")
#p值根据自己需要,abs是绝对值,大于0的很多,大于3的会少一点,表示基因在肿瘤中是正常的三倍以上
res_deseq2 <- as.data.frame(res)%>%
arrange(padj) %>%
dplyr::filter(abs(log2FoldChange) > 3, padj < 0.05)
笔记补充
setwd("xena")#set work directory
ctrl f#查找
install.packages("tidyverse")
library(tidyverse)#每次重新打开R都要library一下
TCGA / 癌症简称 / 缩写 / TCGA癌症中英文对照
Abbr | 英文名称 | 中文名称 |
---|---|---|
ACC | Adrenocortical carcinoma | 肾上腺皮质癌 |
BLCA | Bladder Urothelial Carcinoma | 膀胱尿路上皮癌 |
BRCA | Breast invasive carcinoma | 乳腺浸润癌 |
CESC | Cervical squamous cell carcinoma and endocervical adenocarcinoma | 宫颈鳞癌和腺癌 |
CHOL | Cholangiocarcinoma | 胆管癌 |
COAD | Colon adenocarcinoma | 结肠癌 |
COADREAD | Colon adenocarcinoma/Rectum adenocarcinoma Esophageal carcinoma | 结直肠癌 |
DLBC | Lymphoid Neoplasm Diffuse Large B-cell Lymphoma | 弥漫性大B细胞淋巴瘤 |
ESCA | Esophageal carcinoma | 食管癌 |
FPPP | FFPE Pilot Phase II | FFPE试点二期 |
GBM | Glioblastoma multiforme | 多形成性胶质细胞瘤 |
GBMLGG | Glioma | 胶质瘤 |
HNSC | Head and Neck squamous cell carcinoma | 头颈鳞状细胞癌 |
KICH | Kidney Chromophobe | 肾嫌色细胞癌 |
KIPAN | Pan-kidney cohort (KICH+KIRC+KIRP) | 混合肾癌 |
KIRC | Kidney renal clear cell carcinoma | 肾透明细胞癌 |
KIRP | Kidney renal papillary cell carcinoma | 肾乳头状细胞癌 |
LAML | Acute Myeloid Leukemia | 急性髓细胞样白血病 |
LGG | Brain Lower Grade Glioma | 脑低级别胶质瘤 |
LIHC | Liver hepatocellular carcinoma | 肝细胞肝癌 |
LUAD | Lung adenocarcinoma | 肺腺癌 |
LUSC | Lung squamous cell carcinoma | 肺鳞癌 |
MESO | Mesothelioma | 间皮瘤 |
OV | Ovarian serous cystadenocarcinoma | 卵巢浆液性囊腺癌 |
PAAD | Pancreatic adenocarcinoma | 胰腺癌 |
PCPG | Pheochromocytoma and Paraganglioma | 嗜铬细胞瘤和副神经节瘤 |
PRAD | Prostate adenocarcinoma | 前列腺癌 |
READ | Rectum adenocarcinoma | 直肠腺癌 |
SARC | Sarcoma | 肉瘤 |
SKCM | Skin Cutaneous Melanoma | 皮肤黑色素瘤 |
STAD | Stomach adenocarcinoma | 胃癌 |
STES | Stomach and Esophageal carcinoma | 胃和食管癌 |
TGCT | Testicular Germ Cell Tumors | 睾丸癌 |
THCA | Thyroid carcinoma | 甲状腺癌 |
THYM | Thymoma | 胸腺癌 |
UCEC | Uterine Corpus Endometrial Carcinoma | 子宫内膜癌 |
UCS | Uterine Carcinosarcoma | 子宫肉瘤 |
UVM | Uveal Melanoma | 葡萄膜黑色素瘤 |
counts1 = read.table(file = 'TCGA-LIHC.htseq_counts.tsv', sep = '\t', header = TRUE)#读取tsv文件
counts1 = read.table(file = ''TCGA-LIHC.htseq_counts.tsv'', sep = '\t', header = TRUE)#读取tsv 文件
a == b ,这是数学的等于,a = b是赋值
ENSG00000000003.13 | |
---|---|
基因编号 |
TCGA中样品取样组织类型编号对照包 sample type
01A 肿瘤
11A 正常
Sample Type Codes | NCI Genomic Data Commons
01 | Primary Solid Tumor | TP(实体瘤) |
---|---|---|
02 | Recurrent Solid Tumor | TR |
03 | Primary Blood Derived Cancer - Peripheral Blood | TB (血液相关癌) |
04 | Recurrent Blood Derived Cancer - Bone Marrow | TRBM |
05 | Additional - New Primary | TAP |
06 | Metastatic | TM |
07 | Additional Metastatic | TAM |
08 | Human Tumor Original Cells | THOC |
09 | Primary Blood Derived Cancer - Bone Marrow | TBM |
10 | Blood Derived Normal | NB (血液癌,正常) |
11 | Solid Tissue Normal | NT(实体瘤,正常) |
12 | Buccal Cell Normal | NBC |
13 | EBV Immortalized Normal | NEBV |
14 | Bone Marrow Normal | NBM |
15 | sample type 15 | 15SH |
16 | sample type 16 | 16SH |
20 | Control Analyte | CELLC |
40 | Recurrent Blood Derived Cancer - Peripheral Blood | TRB |
50 | Cell Lines | CELL |
60 | Primary Xenograft Tissue | XP |
61 | Cell Line Derived Xenograft Tissue | XCL |
99 | sample type 99 | 99SH |
alt 再按 - ,在R里可以打出<-,类似于等号
改行名 改列的位置
rownames 行名
colnames 列名
table函数 分组运算
筛选列
c("01A","11A"),把两者放到一个集合里
UCSC Xena
unit =log2(count+1)
count=2^counts -1