VHH High-Throughput Sequencing Data Analysis --- Version 3
Author: Internet
#Benchmark Round3 against Round2:
#name Round3's clusters using Round2's cluster id and representative_CDR3,
#and cross-reference the unique proteins of Round2 against those of Round3
#!/usr/bin/bash
#coding=utf-8
#$1 is the directory containing the files
#$2 is the Round2 file prefix
#$3 is the Round3 file prefix
dir=$1
round2=$2
round3=$3
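#Hypothetical invocation (the post never names this wrapper script, so the
#script name and prefixes below are placeholders):
#  bash compare_round3_vs_round2.sh /path/to/data <round2_prefix> <round3_prefix>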
#NGS_1.py already did a rough filter in an earlier step; its output is ${round3}test1.txt (prefix and suffix are concatenated without a dot)
#Now Round2's cluster id, representative_CDR3, and unique proteins need to be extracted
/usr/bin/python3 /public/home/djs/huiyu/NGS_5.py ${dir} ${round3} ${round2}
#Cluster the newly appeared CDR3s (those with no Round2 match)
awk 'BEGIN{FS="\t"} NR > 1 {print ">"$1;print $6}' ${dir}/Round2.recluster.txt > ${dir}/cluster.fa
~/software/cd-hit-v4.8.1-2019-0228/cd-hit -i ${dir}/cluster.fa -o ${dir}/cluster.test -c 0.8 -g 1 -n 4 -d 0 -l 4 -S 0 -sc 1
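#cd-hit options used above, as I read the cd-hit user guide: -c 0.8 identity
#threshold, -n 4 word size, -g 1 assign each sequence to its most similar cluster
#(slower but more accurate), -d 0 keep full sequence names in the .clstr file,
#-l 4 discard very short sequences, -S 0 require members to match the
#representative's length, -sc 1 sort clusters by size in the output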
cat ${dir}/cluster.test.clstr |sed 's/.*>/>/g'|sed 's/\..*//g' |tr -d '\n' | sed 's/>C/\nC/g' | awk 'BEGIN{FS=">"} {for(i=2;i<=NF;i++){print $1"\t"$i}}' > ${dir}/test.cluster.txt
sed -i '1i cluster_id\tsequence_id' ${dir}/test.cluster.txt
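#The pipeline above flattens the .clstr output into a two-column table. With
#made-up sequence ids, cd-hit lines such as
#  >Cluster 0
#  0   12aa, >seq_27... *
#  1   12aa, >seq_113... at 91.67%
#become
#  Cluster 0\tseq_27
#  Cluster 0\tseq_113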
###########
#Downstream analysis after clustering the newly appeared CDR3s
/usr/bin/python3 /public/home/djs/huiyu/NGS_6.py ${dir} ${round3} ${round2}
#!/usr/bin/python3
# coding: utf-8
#NGS_5.py
import sys
import os
import numpy as np
import pandas as pd
import Levenshtein
#from subprocess import call
Dir = sys.argv[1]
Round3 = sys.argv[2]
Round2 = sys.argv[3]
os.chdir(Dir)
#Load Round2's cluster id and representative_CDR3 into df1
#df1 = pd.read_excel("NB259-h-R2-1_S9_L001_R1_001.fastq.gz.trim_1P.fastq.gz.merge.out.trim_vector.sort.sort.sort.txt.result.xlsx",sheet_name="table_of_unique_cluster",engine="openpyxl")
df1 = pd.read_excel(Round2+".result.xlsx",sheet_name="table_of_unique_cluster",engine="openpyxl")
df1 = df1.loc[:,["cluster_id","cdr3_aa"]]
#Read in the filtered Round3 sequences
#df2 = pd.read_csv("NB259-C-R3-1_S11_L001_R1_001.fastq.gz.trim_1P.fastq.gz.merge.out.trim_vector.sort.sort.sort.txttest1.txt",sep="\t")
df2 = pd.read_csv(Round3+"test1.txt",sep="\t")
df2 = df2.rename(columns={"cluster":"cluster_id"})
#Name Round3's clusters using Round2's information
list1 = list(df2.cdr3_aa)
dictionary = dict(zip(list(df1.cluster_id),list(df1.cdr3_aa)))
x = [] #holds the assigned cluster ids
for i in list1:
    list3 = [1000000] #candidate cluster ids (1000000 is the "no Round2 match" sentinel)
    list4 = [1000000] #corresponding hamming distances
    for key,value in dictionary.items():
        if len(i) == len(value) and Levenshtein.hamming(i,value)/len(i) <= 0.2:
            list3.append(key)
            list4.append(Levenshtein.hamming(i,value))
    index = list4.index(min(list4))
    x.append(list3[index])
    del list3
    del list4
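#A faster equivalent (my sketch, not part of the original script): bucket the
#Round2 representative CDR3s by length once, so each Round3 CDR3 is compared only
#against same-length candidates; it reproduces the x built by the loop above.
from collections import defaultdict
by_len = defaultdict(list)
for key,value in dictionary.items():
    by_len[len(value)].append((key,value))
def assign_cluster(cdr3,max_frac=0.2,sentinel=1000000):
    best_id,best_dist = sentinel,sentinel
    for cid,rep in by_len.get(len(cdr3),[]):
        d = Levenshtein.hamming(cdr3,rep)
        if d/len(cdr3) <= max_frac and d < best_dist:
            best_id,best_dist = cid,d
    return best_id
x = [assign_cluster(i) for i in list1] #same assignments as the nested loop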
#Match the Round3 CDR3s against Round2's representative CDR3s
df2["cluster_id"] = x
df3 = pd.merge(df2,df1,on="cluster_id",how="left")
#How should the new clusters be numbered?
#Couldn't manage it in Python, so it is handled in the shell (the cd-hit step above)
df20 = df3.loc[df3.cluster_id == 1000000]
df21 = df3.loc[df3.cluster_id != 1000000]
df20.to_csv("Round2.recluster.txt",sep = "\t",index=False)
df21.to_csv("Round2.txt",sep = "\t",index=False)
#########################
#!/usr/bin/python3
# coding: utf-8
#NGS_6.py
import sys
import os
import numpy as np
import pandas as pd
import Levenshtein
#from subprocess import call
Dir = sys.argv[1]
Round3 = sys.argv[2]
Round2 = sys.argv[3]
os.chdir(Dir)
df20 = pd.read_csv("Round2.recluster.txt",sep = "\t")
df21 = pd.read_csv("Round2.txt",sep = "\t")
df22 = pd.read_csv("test.cluster.txt",sep="\t")
#A bit roundabout, but the net effect is to give each newly appeared Round3 CDR3 a cluster id and build a data frame with the same structure as df21
df20 = pd.merge(df20,df22,on="sequence_id",how="right")
df20.cluster_id_y = df20.cluster_id_y.map(lambda x:x.split()[1])
df20.cluster_id_y = df20.cluster_id_y.astype("int64")
df20["cluster"] = df20["cluster_id_x"] + df20["cluster_id_y"]
df20 = df20.loc[:,["sequence_id","sequence","v_call","j_call","sequence_alignment_aa","cdr3_aa_x","cDNA_contig","counts_of_contig_in_the_same_AA","cluster","cdr3_aa_y"]]
df20 = df20.rename(columns={"cluster":"cluster_id"})
#Append the clustered new Round3 CDR3s to the result produced by NGS_5.py
df3 = pd.concat([df21,df20],ignore_index=True)
df3.cDNA_contig = df3.cDNA_contig.astype("int64")
df3 = df3.rename(columns={"cdr3_aa_x":"cdr3_aa","cdr3_aa_y":"representative_CDR3"})
#df3.DNA_count.dtype
df4 = df3.groupby(["v_call","j_call","cluster_id"]).cDNA_contig.sum()
df4.name = "counts_of_contig_in_the_same_clonotype"
df5 = pd.merge(df3,df4,on=["v_call","j_call","cluster_id"])
df5.cluster_id = df5.cluster_id.astype("string")
df5["clonotype"] = df5["cluster_id"]+"@"+df5["v_call"].map(lambda x:x.split("*")[0])+"@"+df5["j_call"].map(lambda x:x.split("*")[0])
#
df6 = df5.groupby("cluster_id").cDNA_contig.sum()
df6.name = "counts_of_contig_in_the_same_cluster"
df6 = pd.merge(df5,df6,on="cluster_id") #
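#The groupby + merge above can also be written with transform (my alternative,
#not from the original post); it attaches the per-cluster sum in one step:
#df6 = df5.copy()
#df6["counts_of_contig_in_the_same_cluster"] = df6.groupby("cluster_id").cDNA_contig.transform("sum")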
df7 = df6.loc[:,["clonotype","counts_of_contig_in_the_same_clonotype","cluster_id","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3","counts_of_contig_in_the_same_AA","cDNA_contig","v_call","j_call","sequence_alignment_aa","sequence"]]
#This is the master table
df7 = df7.sort_values("counts_of_contig_in_the_same_cluster",ascending=False)
df7.cluster_id = df7.cluster_id.astype("int64")
#The newly appeared clusters still need renumbering relative to Round2, i.e. Round3 contains cluster ids of 1000000 and above
#df8 = pd.read_excel("NB259-h-R2-1_S9_L001_R1_001.fastq.gz.trim_1P.fastq.gz.merge.out.trim_vector.sort.sort.sort.txt.result.xlsx",sheet_name="table_of_unique_cluster",engine="openpyxl")
df8 = pd.read_excel(Round2+".result.xlsx",sheet_name="table_of_unique_cluster",engine="openpyxl")
df8 = df8.loc[:,["cluster_id","cdr3_aa"]]
#df8 = pd.read_csv(Round1+".dictionary.txt",sep="\t",names=["cluster","represent"])
y = pd.Series(list(df7.cluster_id.drop_duplicates()))
x = pd.Series(list(df7.cluster_id.drop_duplicates()))
count = max(df8.cluster_id) + 1
num = 0
while num < len(x):
    if x[num] >= 1000000:
        x[num] = count
        count = count + 1
    num = num + 1
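#Vectorized equivalent (my sketch, not in the original): recompute the renumbering
#from the untouched copy y, continuing from Round2's highest cluster id.
x2 = y.copy()
mask = x2 >= 1000000
x2.loc[mask] = max(df8.cluster_id) + 1 + np.arange(mask.sum())
#x2 now matches the x produced by the while loop above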
df8 = pd.concat([y,x],axis=1)
df8 = df8.rename(columns={0:"cluster_id",1:"cluster"})
df7 = pd.merge(df7,df8,on="cluster_id",how="left")
df7 = df7.loc[:,["clonotype","counts_of_contig_in_the_same_clonotype","cluster","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3","counts_of_contig_in_the_same_AA","cDNA_contig","v_call","j_call","sequence_alignment_aa","sequence"]]
df7 = df7.rename(columns={"cluster":"cluster_id"})
#Only now is the master table complete
#####################################
#table of unique protein
df8 = df7.sort_values(["counts_of_contig_in_the_same_AA","cDNA_contig"],ascending=False)
df8 = df8.drop_duplicates(subset="sequence_alignment_aa",keep="first") #51816 rows in the example dataset
#One more challenge: flag the proteins in Round3 that are new relative to Round2
#For that, compare against Round2's output from NGS_1.py
#df_8 = pd.read_csv("NB259-h-R2-1_S9_L001_R1_001.fastq.gz.trim_1P.fastq.gz.merge.out.trim_vector.sort.sort.sort.txttest1.txt",sep="\t")
df_8 = pd.read_csv(Round2+"test1.txt",sep="\t")
#Define a helper: returns True when the protein never appears in Round2
def Protein(str1):
    for str2 in list(df_8.sequence_alignment_aa.drop_duplicates()):
        if str1 == str2:
            return False
        else:
            continue
    return True
df8["new_protein"] = df8.sequence_alignment_aa.apply(lambda str1:Protein(str1)) #massively rate-limiting step; a faster version follows below
#table of unique clonotype
df10 = df7.sort_values(["cDNA_contig","counts_of_contig_in_the_same_AA","counts_of_contig_in_the_same_cluster"],ascending=False)
df10 = df10.drop_duplicates(subset="clonotype", keep='first') #52 rows in the example dataset
#table of unique cluster
df11 = df7.sort_values(["counts_of_contig_in_the_same_cluster","cDNA_contig","counts_of_contig_in_the_same_AA"],ascending=False)
df11 = df11.drop_duplicates(subset="cluster_id", keep='first') #47 rows in the example dataset
#On to building the final analysis table
#table of analysis
df12 = df11.loc[:,["cluster_id","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3","cDNA_contig"]]
#df12["frequency_of_total"] = df12.counts_of_contig_in_the_same_cluster/df3.cDNA_contig.sum()
df12["frequency"] = df12.counts_of_contig_in_the_same_cluster/df12.counts_of_contig_in_the_same_cluster.sum()
df12["CDR3_length"] = df12.cdr3_aa.apply(lambda x:len(x))
df12 = df12.loc[:,["cluster_id","counts_of_contig_in_the_same_cluster","frequency","cdr3_aa","representative_CDR3","CDR3_length","cDNA_contig"]]
df12.cluster_id = df12.cluster_id.astype("string")
#df12 = df12.sort_values("cluster_id")
#Almost forgot: the number of distinct proteins per cluster still has to be extracted
#Build it from the table_of_unique_protein data
df16 = df8.groupby("cluster_id",as_index=False).agg("count")
df16 = df16.loc[:,["cluster_id","sequence_alignment_aa"]]
df16 = df16.rename(columns={"sequence_alignment_aa":"variety_of_protein"})
df16.cluster_id = df16.cluster_id.astype("string")
df12 = pd.merge(df12,df16,on="cluster_id",how="inner")
df14 = df12.rename(columns={"cluster_id":"cluster","counts_of_contig_in_the_same_cluster":"No_of_VH_seq","representative_CDR3":"Representative_CDR3","cdr3_aa":"Most_abundant_CDR3","frequency":"VH_frequency","variety_of_protein":"No_of_unique_VH_seq"})
#Reorder the columns
df14 = df14.loc[:,["cluster","No_of_unique_VH_seq","No_of_VH_seq","VH_frequency","Most_abundant_CDR3","CDR3_length","Representative_CDR3"]]
#Flag new families (clusters with no Round2 representative CDR3)
df14["new_family"] = df14.Representative_CDR3.isna()
x = list(df14.new_family)
i = 0
while i < len(x):
    if x[i] == False:
        x[i] = ""
        i = i+1
    else:
        x[i] = "new"
        i = i+1
df14.new_family = x
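#Vectorized equivalent of the loop above (my sketch, not in the original):
df14.new_family = np.where(df14.Representative_CDR3.isna(),"new","")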
#Put a few summary figures at the top of the analysis sheet
df15 = pd.read_csv(Round3+".count.txt",sep="\t")
df15.iloc[2,1] = df3.cDNA_contig.sum()
df15.iloc[3,1] = df11.cluster_id.count()
df15.iloc[4,1] = df8.sequence_alignment_aa.count()
#Write everything to file
with pd.ExcelWriter(Round3+".vs_Round2.xlsx") as writer:
    df7.to_excel(writer, sheet_name="all_of_contig",index=False)
    df8.to_excel(writer, sheet_name="table_of_unique_protein",index=False)
    df10.to_excel(writer, sheet_name="table_of_unique_clonotype",index=False)
    df11.to_excel(writer, sheet_name="table_of_unique_cluster",index=False)
    df14.to_excel(writer, sheet_name="table_of_analysis",index=False,startrow=8)
    df15.to_excel(writer,sheet_name="table_of_analysis",index=False,startrow=0)
Source: https://blog.csdn.net/jiangshandaiyou/article/details/120220602