ATAC-seq实战代码
作者:互联网
自己整理,测试完全可用。从直接下载数据到一般的出图
#!/usr/bin/bash #ly_20211215_atac-seq pepiline #从下载数据到分析出图的一般流程 ###################################### start_time=$(date +%s) #必要索引文件 bt2_index=/home/data/ssy49/data/index/mm10 #一些物种可以直接从bowtie官网下载索引: http://bowtie-bio.sourceforge.net/index.shtml computeMatrix_tmp=/home/data/ssy49/data/index/ucsc_mm10_RefSeq_par.bed #从UCSC下载 R_scrip01=/home/data/ssy49/manscript/R/02_plotInsertSize.R #R包代码看下面 #建立文件夹 #mkdir -p ~/LX/atac #cd ~/LX/atac #:<<"LY1" mkdir sra raw qc clean align peaks motif igv tss :<<"LY1" temp_dir=$(dirname `which conda`) dir_list=$(cd ${temp_dir};cd ../envs;ls) etc_dir=$(cd ${temp_dir};cd ../etc/profile.d/) echo ${dir_list} echo ${dir_list} result=$(echo ${dir_list} |grep "atac") if [[ "${result}" != "" ]];then echo "The atac dir is exist" source ${etc_dir}conda.sh conda activate atac else echo "OK! I will create the atac environment" conda create -n atac -y conda activate atac conda install -y bedtools homer sra-tools deeptools homer meme bowtie bowtie2 trim-galore picard bedtools macs2 genrich sambamba idr fi LY1 ##数据准备 #构建所需数据的列表acc_list.txt #下载数据SRP173505,先获得要下载数据的列表 #nohup prefetch --option-file acc_List.txt --output-directory /home/data/vip10t01/LX/atac & #【这种是单线程下载,并行还是要用循环】这种下载每个文件会对应单独的文件夹,为方便分析,需要将所有的.SRA放在同一文件夹中 for i in `cat acc_list.txt`; do prefetch ${i} --output-directory ~/LX/atac/ & done wait find -name *.sra |xargs -I '{}' mv {} ~/LX/atac/sra/ #移动当前目录下所有的.sra文件,到指定目录 find -type d |grep "SRR.*" |xargs rm -rf #删除文件夹 LY1 for i in `cat acc_list.txt`; do fasterq-dump sra/${i}.sra --split-3 -O raw/ & done #批量将.sra文件转为fastq文件【注意sra-tools版本不一样,可能导致输出的fastq文件后缀有所差别,这里使用的是conda install sra-tools=2.10.8;如果使用sra-tools=2.11.0,输出结果的文件后缀就会有点差别】 wait #质控 for i in `cat acc_list.txt`; do fastqc raw/${i}.sra* -o qc/ & done wait cd qc multiqc -n "raww_qc" -s ../qc/ cd .. for i in `cat acc_list.txt`; do trim_galore --phred33 -q 25 --length 35 --stringency 4 -e 0.1 --fastqc --paired -o clean raw/${i}.sra_1.fastq raw/${i}.sra_2.fastq & done #-o结果输出到clean文件夹;--fastqc表示完成后会直接进行fastqc wait cd qc multiqc -n "clean_qc" -s ../clean/ cd .. #cutadapt -a AGATGTGTATAAGAGACAG -A AGATGTGTATAAGAGACAG -q 10 --trim-n -m 10 -o Rep1_trim.R1.fq -p Rep1_trim.R2.fq Rep1_R1.fq.gz Rep1_R2.fq.gz #cutadapt -a AGATGTGTATAAGAGACAG -A AGATGTGTATAAGAGACAG -q 10 --trim-n -m 10 -o Rep2_trim.R1.fq -p Rep2_trim.R2.fq Rep2_R1.fq.gz Rep2_R2.fq.gz #第二步 建立index:一些物种可以直接从bowtie官网下载索引: http://bowtie-bio.sourceforge.net/index.shtml #第三步 比对【bowtie2参考:https://blog.csdn.net/herokoking/article/details/77847384】 #for i in `cat acc_List.txt`; do nohup bowtie2 -x /home/data/vip10t01/data/mm10 -1 clean/${i}.sra_1_val_1.fq -2 clean/${i}.sra_2_val_2.fq -S ${i}.sam & done #wait #如果为单末端测序的话,上述命令换为: #bowtie2 -p 6 -3 5 --local -x mm10 -U /opt/sdc/SRR/example.fastq -S example.sam #bowtie2 -p 10 -x genome_index -1 input_1.fq -2 input_2.fq | samtools sort -O bam -@ 10 -o - > output.bam #合并多个命令 for i in `cat acc_list.txt`; do bowtie2 -x ${bt2_index} -1 clean/${i}.sra_1_val_1.fq -2 clean/${i}.sra_2_val_2.fq -X 2000 --very-sensitive -S ${i}.sam & done #注意变量${bt2_index} wait #第四步 挑选可靠的比对结果 mv *.sam align/ for i in `cat acc_list.txt`; do samtools view -b -f 2 -q 30 -o align/${i}.bam align/${i}.sam & done wait #samtools view -b -f 2 -q 30 -o Rep1.pairs.bam Rep1.sam #samtools view -b -f 2 -q 30 -o Rep2.pairs.bam Rep2.sam #samtools view Rep1.pairs.sort.bam |less -S #第五步 去除PCR重复【picards(GATK软件包包装了picard,因此也可使用)或sambamba】 for i in `cat acc_list.txt`; do samtools sort -o align/${i}_sort.bam align/${i}.bam & done wait #samtools sort -o Rep1.pairs.sort.bam Rep1.pairs.bam #samtools sort -o Rep2.pairs.sort.bam Rep2.pairs.bam for i in `cat acc_list.txt`; do samtools index align/${i}_sort.bam & done wait for i in `cat acc_list.txt`; do samtools flagstat align/${i}_sort.bam > align/${i}_raw_stat & done #状态保存 wait for i in `cat acc_list.txt`; do picard MarkDuplicates REMOVE_DUPLICATES=true I=align/${i}_sort.bam O=align/${i}_sort_redup.bam M=align/${i}.duplicates.log & done #注意conda安装的picard的调用方法 wait for i in `cat acc_list.txt`;do sambamba markdup --overflow-list-size 600000 --tmpdir='./' -r align/${i}_sort.bam align/${i}_sort_redup_sb.bam & done #第二个去PCR重复软件,可以对比一下结果 wait #java -jar picard.jar MarkDuplicates REMOVE_DUPLICATES=true I=Rep1.pairs.sort.bam O=Rep1.pairs.sort.dedup.bam M=Rep1.duplicates.log #java -jar picard.jar MarkDuplicates REMOVE_DUPLICATES=true I=Rep2.pairs.sort.bam O=Rep2.pairs.sort.dedup.bam M=Rep2.duplicates.log for i in `cat acc_list.txt`; do samtools index align/${i}_sort_redup.bam & done wait for i in `cat acc_list.txt`; do samtools flagstat align/${i}_sort_redup.bam > align/${i}_redup_stat & done #对比raw_stat和reduo_stat就可以知道去除了什么 wait #第六步 查看线粒体污染【建立索引】 #samtools index Rep1.pairs.sort.dedup.bam #samtools index Rep2.pairs.sort.dedup.bam #第七步 去除线粒体污染【从下载的地方可知,本例中序列编号NC_012920.1表示线粒体】 for i in `cat acc_list.txt`; do samtools view -h align/${i}_sort_redup.bam |grep -v "chrM" |samtools view -bS -o align/${i}.final.bam & done wait #samtools view -h Rep1.pairs.sort.dedup.bam |grep -v "chrM" |grep -v "Pt" |samtools view -bS -o Rep1.final.bam #samtools view -h Rep2.pairs.sort.dedup.bam |grep -v "chrM" |grep -v "Pt" |samtools view -bS -o Rep2.final.bam #第八步 统计片段插入长度分布 for i in `cat acc_list.txt`; do samtools view -f 0x40 align/${i}.final.bam |perl -ane 'print abs($F[8]);print "\n";' |sort -n > align/${i}_insertSiz.txt & done #提取插入片段长度 wait #plotinsertSize Rep1.final.bam Rep1 #plotinsertSize Rep2.final.bam Rep2 for i in `cat acc_list.txt`; do Rscript ${R_scrip01} align/${i}_insertSiz.txt align/${i}_insertSiz.png & done #作图。注意脚本的引入 wait #第九步 peak calling【macs2或generich】 for i in `cat acc_list.txt`; do bedtools bamtobed -i align/${i}.final.bam > align/${i}.final.bed & done wait #bedtools bamtobed -i Rep1.final.bam >Rep1.final.bed #bedtools bamtobed -i Rep2.final.bam >Rep2.final.bed for i in `cat acc_list.txt`; do macs2 callpeak -t align/${i}.final.bed -n peaks/${i} --shift -100 --extsize 200 --nomodel -B -g mm & done wait #macs2 callpeak -t Rep1.final.bed -n Rep1 --shift -100 --extsize 200 --nomodel -B SPMR -g 2.4e8 #没有SPMR这个参数 #macs2 callpeak -t Rep2.final.bed -n Rep2 --shift -100 --extsize 200 --nomodel -B SPMR -g 2.4e8 #Genrich进行call peak for i in `cat acc_list.txt`; do samtools sort -n align/${i}.final.bam -o align/${i}_final_sort.bam & done #必须-n排序 wait for i in `cat acc_list.txt`; do Genrich -t align/${i}_final_sort.bam -o peaks/${i}_gr.narrowPeak & done wait #第十步 peak的重复性,看出两次实验的可重复性 #method 1 直接看数值 #bedtools intersect -a SRR2927015_peaks.narrowPeak -b SRR2927015_peaks.narrowPeak |wc -l #bedtools intersect -a SRR2927015_peaks.narrowPeak -b SRR2927015_peaks.narrowPeak -f 0.5 -F 0.5 |wc -l #统计超过阈值的重复peaks #method 2 利用idr出图 #idr -s SRR2927015_peaks.narrowPeak SRR2927016_peaks.narrowPeak -o 15-16_idrResult --plot #method 3 出Venn图【本地绘制的】 #第十一步 peak在基因上的分布 Rscript PeakAnnotation.R ../index/genes.gtf Rep1_peaks.narrowPeak Rep1 Rscript PeakAnnotation.R ../index/genes.gtf Rep2_peaks.narrowPeak Rep2 #deeptools获取bw文件,利于IGV的可视化 ls align/*.final.bam |xargs -i samtools index {} for i in `cat acc_list.txt`; do bamCoverage --normalizeUsing CPM -b align/${i}.final.bam -o igv/${i}.final.bw & done #运行前先建索引 wait #获取ref.bed文件: 从UCSC的Table Browser下载【只需要里面的坐标位置信息就行】 #第十二步 TSS富集 #replicate 1 #bedSort Rep1_treat_pileup.bdg stdout |bedClip -truncate stdin ../index/ref.fa.fai stdout |perl -ane 'print if($F[1]<$F[2])' >Rep1_treat_pileup.bedGraph #bedGraphToBigWig Rep1_treat_pileup.bedGraph ../index/ref.fa.fai Rep1.bw #replicate 2 #bedSort Rep2_treat_pileup.bdg stdout |bedClip -truncate stdin ../index/ref.fa.fai stdout |perl -ane 'print if($F[1]<$F[2])' >Rep2_treat_pileup.bedGraph #bedGraphToBigWig Rep2_treat_pileup.bedGraph ../index/ref.fa.fai Rep2.bw #computeMatrix reference-point -S Rep1.bw Rep2.bw -R ../index/genes.gtf -a 3000 -b 3000 -p 1 -o tss3k.matrix.gz computeMatrix reference-point --referencePoint TSS -p 5 -b 3000 -a 3000 -R ${computeMatrix_tmp} -S igv/*.final.bw --skipZeros -o tss/matrix_tss3k.gz --outFileSortedRegions tss/regions1k_genes.bed & wait plotHeatmap -m tss/matrix_tss3k.gz -out tss/tss3k_heatmap.png --colorMap Reds & plotHeatmap -m tss/matrix_tss3k.gz -out tss/tss3k_heatmap.pdf --plotFileFormat pdf --dpi 720 & #第十三步 peaks的注释 #method 1 本地使用ChIPseeker包 #method 2 使用homer。用conda安装好homer后,还需要使用附带的脚本来下载对应的数据库 #上游是基于mm10找的peaks,因此要下载mm10的数据库。 #对最终的.narrowPeaks做motif分析 ##method_1 homer【https://www.jianshu.com/p/467d970ec097】 #conda install homer #which homer #找到安装homer的路径,要找到该路径下的configureHomer.pl执行文件,然后执行如下命令,下载对应的注释库 #perl configureHomer.pl -install mm10 #下载数据库 for i in `cat acc_list.txt`; do findMotifsGenome.pl peaks/${i}_peaks.narrowPeak mm10 motif/${i}_motifDir -len 8,10,12 & done wait echo "The pipeline was finished" end_time=$(date +%s) spend_time=$((end_time - start_time)) echo Time taken to execute commands is $spend seconds.
所需附加代码
#02_plotInsertSize.R #ly20211217 ########################### cmd=commandArgs(trailingOnly=TRUE); input=cmd[1]; output=cmd[2]; d=read.table(input); png(file=output); hist(abs(d$V1), main="Insertion Size distribution", ylab="Read Count", xlab="Insert Size", xaxt="n", col="pink", breaks=function(x) c(0:ceiling(max(x)))); axis(side=1, at=seq(0, max(d$V1)), labels=seq(0, max(d$V1))); dev.off()
标签:实战,sort,samtools,seq,align,ATAC,Rep2,Rep1,bam 来源: https://www.cnblogs.com/ly-zy/p/15694744.html