其他分享
首页 > 其他分享> > ATAC-seq实战代码

ATAC-seq实战代码

作者:互联网

自己整理,测试完全可用。从直接下载数据到一般的出图

#!/usr/bin/bash
#ly_20211215_atac-seq pepiline
#从下载数据到分析出图的一般流程
######################################
start_time=$(date +%s)

#必要索引文件
bt2_index=/home/data/ssy49/data/index/mm10  #一些物种可以直接从bowtie官网下载索引: http://bowtie-bio.sourceforge.net/index.shtml
computeMatrix_tmp=/home/data/ssy49/data/index/ucsc_mm10_RefSeq_par.bed #从UCSC下载
R_scrip01=/home/data/ssy49/manscript/R/02_plotInsertSize.R #R包代码看下面

#建立文件夹
#mkdir -p ~/LX/atac
#cd ~/LX/atac
#:<<"LY1"
mkdir sra raw qc clean align peaks motif igv tss

:<<"LY1"
temp_dir=$(dirname `which conda`)
dir_list=$(cd ${temp_dir};cd ../envs;ls)
etc_dir=$(cd ${temp_dir};cd ../etc/profile.d/)
echo ${dir_list}
echo ${dir_list}
result=$(echo ${dir_list} |grep "atac")
if [[ "${result}" != "" ]];then
        echo "The atac dir is exist"
        source ${etc_dir}conda.sh
        conda activate atac
else
        echo "OK! I will create the atac environment"
        conda create -n atac -y
        conda activate atac
        conda install -y bedtools homer sra-tools deeptools homer meme bowtie bowtie2 trim-galore picard bedtools macs2 genrich sambamba idr
fi
LY1

##数据准备
#构建所需数据的列表acc_list.txt
#下载数据SRP173505,先获得要下载数据的列表
#nohup prefetch --option-file acc_List.txt --output-directory /home/data/vip10t01/LX/atac & #【这种是单线程下载,并行还是要用循环】这种下载每个文件会对应单独的文件夹,为方便分析,需要将所有的.SRA放在同一文件夹中
for i in `cat acc_list.txt`; do prefetch ${i} --output-directory ~/LX/atac/ & done
wait
find -name *.sra |xargs -I '{}' mv {} ~/LX/atac/sra/ #移动当前目录下所有的.sra文件,到指定目录
find -type d |grep "SRR.*" |xargs rm -rf #删除文件夹
LY1

for i in `cat acc_list.txt`; do fasterq-dump sra/${i}.sra --split-3 -O raw/ & done #批量将.sra文件转为fastq文件【注意sra-tools版本不一样,可能导致输出的fastq文件后缀有所差别,这里使用的是conda install sra-tools=2.10.8;如果使用sra-tools=2.11.0,输出结果的文件后缀就会有点差别】
wait

#质控
for i in `cat acc_list.txt`; do fastqc raw/${i}.sra* -o qc/ & done
wait
cd qc
multiqc -n "raww_qc" -s ../qc/
cd ..
for i in `cat acc_list.txt`; do trim_galore --phred33 -q 25 --length 35 --stringency 4 -e 0.1 --fastqc --paired -o clean raw/${i}.sra_1.fastq raw/${i}.sra_2.fastq & done #-o结果输出到clean文件夹;--fastqc表示完成后会直接进行fastqc
wait
cd qc
multiqc -n "clean_qc" -s ../clean/
cd ..

#cutadapt -a AGATGTGTATAAGAGACAG -A AGATGTGTATAAGAGACAG -q 10 --trim-n -m 10 -o Rep1_trim.R1.fq -p Rep1_trim.R2.fq Rep1_R1.fq.gz Rep1_R2.fq.gz
#cutadapt -a AGATGTGTATAAGAGACAG -A AGATGTGTATAAGAGACAG -q 10 --trim-n -m 10 -o Rep2_trim.R1.fq -p Rep2_trim.R2.fq Rep2_R1.fq.gz Rep2_R2.fq.gz

#第二步 建立index:一些物种可以直接从bowtie官网下载索引: http://bowtie-bio.sourceforge.net/index.shtml
#第三步 比对【bowtie2参考:https://blog.csdn.net/herokoking/article/details/77847384】
#for i in `cat acc_List.txt`; do nohup bowtie2 -x /home/data/vip10t01/data/mm10 -1 clean/${i}.sra_1_val_1.fq -2 clean/${i}.sra_2_val_2.fq -S ${i}.sam & done
#wait
#如果为单末端测序的话,上述命令换为:
#bowtie2 -p 6 -3 5 --local -x mm10 -U /opt/sdc/SRR/example.fastq -S example.sam
#bowtie2 -p 10 -x genome_index -1 input_1.fq -2 input_2.fq | samtools sort -O bam -@ 10 -o - > output.bam #合并多个命令

for i in `cat acc_list.txt`; do bowtie2 -x ${bt2_index} -1 clean/${i}.sra_1_val_1.fq -2 clean/${i}.sra_2_val_2.fq -X 2000 --very-sensitive -S ${i}.sam & done #注意变量${bt2_index}
wait

#第四步 挑选可靠的比对结果
mv *.sam align/
for i in `cat acc_list.txt`; do samtools view -b -f 2 -q 30 -o align/${i}.bam align/${i}.sam & done
wait
#samtools view -b -f 2 -q 30 -o Rep1.pairs.bam Rep1.sam
#samtools view -b -f 2 -q 30 -o Rep2.pairs.bam Rep2.sam
#samtools view Rep1.pairs.sort.bam |less -S

#第五步 去除PCR重复【picards(GATK软件包包装了picard,因此也可使用)或sambamba】
for i in `cat acc_list.txt`; do samtools sort -o align/${i}_sort.bam align/${i}.bam & done
wait
#samtools sort -o Rep1.pairs.sort.bam Rep1.pairs.bam
#samtools sort -o Rep2.pairs.sort.bam Rep2.pairs.bam
for i in `cat acc_list.txt`; do samtools index align/${i}_sort.bam & done
wait
for i in `cat acc_list.txt`; do samtools flagstat align/${i}_sort.bam > align/${i}_raw_stat & done #状态保存
wait
for i in `cat acc_list.txt`; do picard MarkDuplicates REMOVE_DUPLICATES=true I=align/${i}_sort.bam O=align/${i}_sort_redup.bam M=align/${i}.duplicates.log & done #注意conda安装的picard的调用方法
wait
for i in `cat acc_list.txt`;do sambamba markdup --overflow-list-size 600000 --tmpdir='./' -r align/${i}_sort.bam align/${i}_sort_redup_sb.bam & done #第二个去PCR重复软件,可以对比一下结果
wait
#java -jar picard.jar MarkDuplicates REMOVE_DUPLICATES=true I=Rep1.pairs.sort.bam O=Rep1.pairs.sort.dedup.bam M=Rep1.duplicates.log
#java -jar picard.jar MarkDuplicates REMOVE_DUPLICATES=true I=Rep2.pairs.sort.bam O=Rep2.pairs.sort.dedup.bam M=Rep2.duplicates.log
for i in `cat acc_list.txt`; do samtools index align/${i}_sort_redup.bam & done
wait
for i in `cat acc_list.txt`; do samtools flagstat align/${i}_sort_redup.bam > align/${i}_redup_stat & done #对比raw_stat和reduo_stat就可以知道去除了什么
wait
#第六步 查看线粒体污染【建立索引】
#samtools index Rep1.pairs.sort.dedup.bam
#samtools index Rep2.pairs.sort.dedup.bam

#第七步 去除线粒体污染【从下载的地方可知,本例中序列编号NC_012920.1表示线粒体】
for i in `cat acc_list.txt`; do samtools view -h align/${i}_sort_redup.bam |grep -v "chrM" |samtools view -bS -o align/${i}.final.bam & done
wait
#samtools view -h Rep1.pairs.sort.dedup.bam |grep -v "chrM" |grep -v "Pt" |samtools view -bS -o Rep1.final.bam
#samtools view -h Rep2.pairs.sort.dedup.bam |grep -v "chrM" |grep -v "Pt" |samtools view -bS -o Rep2.final.bam

#第八步 统计片段插入长度分布
for i in `cat acc_list.txt`; do samtools view -f 0x40 align/${i}.final.bam |perl -ane 'print abs($F[8]);print "\n";' |sort -n > align/${i}_insertSiz.txt & done #提取插入片段长度
wait
#plotinsertSize Rep1.final.bam Rep1
#plotinsertSize Rep2.final.bam Rep2
for i in `cat acc_list.txt`; do Rscript ${R_scrip01} align/${i}_insertSiz.txt align/${i}_insertSiz.png & done #作图。注意脚本的引入
wait

#第九步 peak calling【macs2或generich】
for i in `cat acc_list.txt`; do bedtools bamtobed -i align/${i}.final.bam > align/${i}.final.bed & done
wait
#bedtools bamtobed -i Rep1.final.bam >Rep1.final.bed
#bedtools bamtobed -i Rep2.final.bam >Rep2.final.bed

for i in `cat acc_list.txt`; do macs2 callpeak -t align/${i}.final.bed -n peaks/${i} --shift -100 --extsize 200 --nomodel -B -g mm & done
wait
#macs2 callpeak -t Rep1.final.bed -n Rep1 --shift -100 --extsize 200 --nomodel -B SPMR -g 2.4e8 #没有SPMR这个参数
#macs2 callpeak -t Rep2.final.bed -n Rep2 --shift -100 --extsize 200 --nomodel -B SPMR -g 2.4e8

#Genrich进行call peak
for i in `cat acc_list.txt`; do samtools sort -n align/${i}.final.bam -o align/${i}_final_sort.bam & done #必须-n排序
wait
for i in `cat acc_list.txt`; do Genrich -t align/${i}_final_sort.bam -o peaks/${i}_gr.narrowPeak & done
wait
#第十步 peak的重复性,看出两次实验的可重复性
#method 1 直接看数值
#bedtools intersect -a SRR2927015_peaks.narrowPeak -b SRR2927015_peaks.narrowPeak |wc -l
#bedtools intersect -a SRR2927015_peaks.narrowPeak -b SRR2927015_peaks.narrowPeak -f 0.5 -F 0.5 |wc -l #统计超过阈值的重复peaks
#method 2 利用idr出图
#idr -s SRR2927015_peaks.narrowPeak SRR2927016_peaks.narrowPeak -o 15-16_idrResult --plot
#method 3 出Venn图【本地绘制的】

#第十一步 peak在基因上的分布
Rscript PeakAnnotation.R ../index/genes.gtf Rep1_peaks.narrowPeak Rep1
Rscript PeakAnnotation.R ../index/genes.gtf Rep2_peaks.narrowPeak Rep2

#deeptools获取bw文件,利于IGV的可视化
ls align/*.final.bam |xargs -i samtools index {}
for i in `cat acc_list.txt`; do bamCoverage --normalizeUsing CPM -b align/${i}.final.bam -o igv/${i}.final.bw & done #运行前先建索引
wait

#获取ref.bed文件: 从UCSC的Table Browser下载【只需要里面的坐标位置信息就行】

#第十二步 TSS富集
#replicate 1
#bedSort Rep1_treat_pileup.bdg stdout |bedClip -truncate stdin ../index/ref.fa.fai stdout |perl -ane 'print if($F[1]<$F[2])' >Rep1_treat_pileup.bedGraph
#bedGraphToBigWig Rep1_treat_pileup.bedGraph ../index/ref.fa.fai Rep1.bw
#replicate 2
#bedSort Rep2_treat_pileup.bdg stdout |bedClip -truncate stdin ../index/ref.fa.fai stdout |perl -ane 'print if($F[1]<$F[2])' >Rep2_treat_pileup.bedGraph
#bedGraphToBigWig Rep2_treat_pileup.bedGraph ../index/ref.fa.fai Rep2.bw
#computeMatrix reference-point -S Rep1.bw Rep2.bw -R ../index/genes.gtf -a 3000 -b 3000 -p 1 -o tss3k.matrix.gz
computeMatrix reference-point --referencePoint TSS -p 5 -b 3000 -a 3000 -R ${computeMatrix_tmp} -S igv/*.final.bw --skipZeros -o tss/matrix_tss3k.gz --outFileSortedRegions tss/regions1k_genes.bed &
wait
plotHeatmap -m tss/matrix_tss3k.gz -out tss/tss3k_heatmap.png --colorMap Reds &
plotHeatmap -m tss/matrix_tss3k.gz -out tss/tss3k_heatmap.pdf --plotFileFormat pdf --dpi 720 &

#第十三步 peaks的注释
#method 1 本地使用ChIPseeker包
#method 2 使用homer。用conda安装好homer后,还需要使用附带的脚本来下载对应的数据库
#上游是基于mm10找的peaks,因此要下载mm10的数据库。

#对最终的.narrowPeaks做motif分析
##method_1 homer【https://www.jianshu.com/p/467d970ec097】
#conda install homer
#which homer #找到安装homer的路径,要找到该路径下的configureHomer.pl执行文件,然后执行如下命令,下载对应的注释库
#perl configureHomer.pl -install mm10 #下载数据库
for i in `cat acc_list.txt`; do findMotifsGenome.pl peaks/${i}_peaks.narrowPeak mm10 motif/${i}_motifDir -len 8,10,12 & done
wait

echo "The pipeline was finished"
end_time=$(date +%s)
spend_time=$((end_time - start_time))
echo Time taken to execute commands is $spend seconds.

所需附加代码

#02_plotInsertSize.R
#ly20211217
###########################
cmd=commandArgs(trailingOnly=TRUE);
input=cmd[1];
output=cmd[2];
d=read.table(input);
png(file=output);
hist(abs(d$V1), main="Insertion Size distribution", ylab="Read Count", xlab="Insert Size", xaxt="n", col="pink", breaks=function(x) c(0:ceiling(max(x))));
axis(side=1, at=seq(0, max(d$V1)), labels=seq(0, max(d$V1)));
dev.off()

 

标签:实战,sort,samtools,seq,align,ATAC,Rep2,Rep1,bam
来源: https://www.cnblogs.com/ly-zy/p/15694744.html