2021-10-26 宏基因组 分析(个人笔记2)
作者:互联网
代码涉及软件、数据库都是绝对路径
source /home/dengqr/miniconda3/bin/activate
conda config --set auto_activate_base true
find *log |wc -l
#备份数据 【原始数据不动original】
cp -r 00data 00data2
#数据上次确认
ls -l | grep ".gz$" > 1.txt
查看虚拟环境列表
conda env list
创建虚拟环境,防污染环境变量,如果有的软件在Solving environment步骤数小时无法安装,可以新建环境
conda create -n meta
加载环境
conda activate meta
### 质量评估fastqc
# =为指定版本,-c指定安装源,均可加速安装
# -y为同意安装
conda install fastqc=0.11.9 -c bioconda -y
fastqc -v
### 评估报告汇总multiqc
# 注1.7为Python2环境,1.8/9新版本需要Python3的环境
conda install multiqc=1.9 -c bioconda -y
multiqc --version
### 质量控制流程kneaddata
conda install kneaddata=0.7.4 -c bioconda -y
kneaddata --version
trimmomatic -version # 0.39
bowtie2 --version # 2.4.2
db= /home/dengqr/dataset/metagenome/ #这里直接cd 到要放的目录吧
# 查看可用数据库
kneaddata_database
# 包括人基因组bowtie2/bmtagger、人类转录组、核糖体RNA和小鼠基因组
# 下载人基因组bowtie2索引 3.44 GB
mkdir -p $/home/dengqr/dataset/metagenome/kneaddata/human_genome
kneaddata_database --download human_genome bowtie2 $/home/dengqr/dataset/metagenome/kneaddata/human_genome
# 数据库下载慢或失败,附录有百度云和国内备份链接
下载到了 $/home/dengqr/home/dengqr/dataset/metagenome/kneaddata/
#mv /home/dengqr/$/home/dengqr/dataset/metagenome/kneaddata/ /home/dengqr/dataset/metagenome/ 【已完成】
## 1.2 (可选)FastQC质量评估
# 第一次使用软件要记录软件版本,文章方法中必须写清楚
fastqc --version # 0.11.8
# time统计运行时间,fastqc质量评估
# *.gz为原始数据,-t指定多线程
time fastqc seq/*.gz -t 2
▼ #32线程time= 27分钟 tip:在往上加
time fastqc -o 01fastqc 00data/*.gz -t 32
multiqc将fastqc的多个报告生成单个整合报告,方法批量查看和比较
# 记录软件版本
multiqc --version # 1.5
# 整理seq目录下fastqc报告,输出multiqc_report.html至result/qc目录
multiqc -d seq/ -o result/qc
▼
multiqc -d 01fastqc/ -o 02fastqc_result/
查看右侧result/qc目录中multiqc_report.html
#去除宿主
#索引目录 "/home/dengqr/dataset/metagenome/kneaddata/human_genome/hg37dec_v0.1.1.bt2"
kneaddata -h
#去宿主后双端不匹配——序列改名 先检查下
zcat 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq.gz |head -n 6
zcat 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq.gz |head -n 6
cp 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq.gz 00datatrain\
cp 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq.gz 00datatrain\
zcat 00datatrain/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq.gz |head -n 6
zcat 00datatrain/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq.gz |head -n 6
(可选) 序列改名,解决NCBI SRA数据双端ID重名问题,详见[《MPB:随机宏基因组测序数据质量控制和去宿主的分析流程和常见问题》](https://mp.weixin.qq.com/s/ovL4TwalqZvwx5qWb5fsYA)。
gunzip 00datatrain/*.gz
sed -i '1~4 s/$/\\1/g' 00datatrain/*R2_001.fastq
sed -i '1~4 s/$/\\2/g' 00datatrain/*R1_001.fastq
# 再次核对样本是否标签有重复
zcat 00datatrain/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq |head -n 6
zcat 00datatrain/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq |head -n 6
# 结果压缩节省空间
gzip seq/*.fq
# pigz是并行版的gzip,没装可使用为gzip
pigz seq/*.fq
time kneaddata -i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq.gz \
-i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq.gz \
-o 03qc -v -t 32 --remove-intermediate-output \
--reorder --bowtie2-options "--very-sensitive --dovetail" \
-db kneaddata/human_genome
### Java不匹配——重装Java运行环境
若出现错误 Unrecognized option: -d64,则安装java解决:
conda install -c cyclus java-jdk
/home/dengqr/miniconda2/bin/trimmomatic
type trimmomatic
type fastqc
"/home/dengqr/miniconda3/share/trimmomatic-0.39-2/"
time kneaddata -i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq.gz \
-i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq.gz \
-o 03qc -v -t 32 --remove-intermediate-output \
-db kneaddata/human_genome
time kneaddata -i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq.gz \
-i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq.gz \
-o 03qc -v -t 32 --remove-intermediate-output \
--trimmomatic home/dengqr/miniconda2/bin/trimmomatic \
--reorder --bowtie2-options "--very-sensitive --dovetail" \
-db kneaddata/human_genome
time kneaddata -t 40 -v \
-i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq.gz \
-i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq.gz \
-o 03qc/ \
--trimmomatic /home/dengqr/miniconda3/share/trimmomatic-0.39-2/ \
--max-memory 80g \
--trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
-db kneaddata/human_genome/ \
--bowtie2-options "--very-sensitive --dovetail --reoeder" \
--remove-intermediate-output
"/home/dengqr/miniconda2/bin/trimmomatic-0.33.jar"
"/home/dengqr/miniconda3/bin/trimmomatic"
▼
time kneaddata \
-i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq.gz \
-i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq.gz \
-o temp/qc -v -t 40 --remove-intermediate-output \
--trimmomatic /home/dengqr/miniconda3/share/trimmomatic-0.39-2/ \
--trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--reorder --bowtie2-options "--very-sensitive --dovetail" \
-db kneaddata/human_genome
▼
time kneaddata \
-i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R2_001.fastq.gz \
-i 00data/OSCC35A_20211015NA_AGGCAGAA_S156_L002_R1_001.fastq.gz \
-o temp1/qc -v -t 40 --remove-intermediate-output \
--trimmomatic /home/dengqr/miniconda3/share/trimmomatic-0.39-2/ \
--trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--reorder --bowtie2-options "--very-sensitive --dovetail" \
-db kneaddata/human_genome
▼
# 采用kneaddata附属工具kneaddata_read_count_table
kneaddata_read_count_table --input temp1/qc \
--output temp1/kneaddata.txt
# 筛选重点结果列
cut -f 1,2,4,12,13 temp1/kneaddata.txt | sed 's/_1_kneaddata//' > temp1/qc/sum.txt
cat temp1/qc/sum.txt
#质控结果
fastqc temp1/qc/*_1_kneaddata_paired_*.fastq -t 2 -o temp1
multiqc -d temp1/ -o temp1/
fastqc temp1/qc/*R2_001_kneaddata_paired_*.fastq -t 2 -o temp1
multiqc -d temp1/ -o temp1/
OSCC35A_20211015NA_AGGCAGAA_S156_L002_
"/home/dengqr/dataset/metagenome/00data/Control105A_R1_001.fastq.gz"
#多任务并行运行
→记得知情同意 # 打will cite承诺引用并行软件parallel
parallel --citation
parallel -j 3 --xapply "echo 00data/{1}_R1_001.fastq.gz 00data/{1}_R2_001.fastq.gz" ::: `tail -n+2 metadata.txt|cut -f1`
time parallel -j 2 --xapply \
"kneaddata -i 00data/{1}_R1_001.fastq.gz \
-i 00data/{1}_R2_001.fastq.gz \
-o temp/qc -v -t 40 --remove-intermediate-output \
--trimmomatic /home/dengqr/miniconda3/share/trimmomatic-0.39-2/ \
--trimmomatic-options 'SLIDINGWINDOW:4:20 MINLEN:50' \
--reorder --bowtie2-options '--very-sensitive --dovetail' \
-db kneaddata/human_genome" ::: `tail -n+2 metadata.txt|cut -f1`
##
# 安装conda-pack
conda install -c conda-forge conda-pack
#conda 安装不成功可用pip安装
# pip conda-pack
# 安装好的环境下打包导出
conda pack -n humann2 -o humann2.tar.gz
# 下载
wget -c http://210.75.224.110/db/humann2/humann2.tar.gz
# 新建文件夹存放humann2环境 "/home/dengqr/miniconda3/envs/"
mkdir -p /home/dengqr/miniconda3/envs/humann2
tar -xzf humann2.tar.gz -C /home/dengqr/miniconda3/envs/humann2
# 激活环境
source /home/dengqr/miniconda3/envs/humann2/bin/activate
测试流程是否可用 time=20分钟
humann2_test
humann2 --version
切换到cd mi 3/en /hu
humann2_databases --download utility_mapping full /home/dengqr/miniconda3/envs/humann2
# 微生物泛基因组 5.37 GB 【要完整路径】【手动创文件夹】
humann2_databases --download chocophlan full /home/dengqr/miniconda3/envs/humann2/chocophlan
# 功能基因diamond索引 10.3 GB 【注意文件夹位置】
humann2_databases --download uniref uniref90_diamond /home/dengqr/miniconda3/envs/humann2
#备用下载【注意切换】 →选择百度云
https://github.com/YongxinLiu/MicrobiomeStatPlot/blob/master/Data/BigDataDownlaodList.md
拉进去/home/dengqr/miniconda3/envs/humann2
tar xvzf uniref90_annotated_1_1.tar.gz
#查看数据框是否配置上了
humann2_config
mkdir temp/concat
# 双端合并为单个文件
for i in `tail -n+2 metadata.txt|cut -f1`;do
cat temp/qc/${i}_1_kneaddata_paired_?.fastq \
> temp/concat/${i}.fq;
done
# 查看样品数量和大小
ls -sh temp/concat/*.fq
标签:10,26,kneaddata,--,fastq,gz,001,2021,dengqr 来源: https://blog.csdn.net/weixin_46623488/article/details/120982780