
VHH High-Throughput Sequencing Data Analysis (Revised)


bash NGS_round1.sh $(pwd) round1.txt
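Here $(pwd) supplies the working directory and round1.txt is the IgBLAST output for panning round 1: an AIRR-style tab-separated rearrangement table whose stop_codon, vj_in_frame, v_frameshift, productive and complete_vdj columns drive the filtering below.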

# Shell script: NGS_round1.sh
#!/usr/bin/bash

file=${2} # the IgBLAST result file to process
dir=${1}  # directory containing the files

/usr/bin/python3 /public/home/djs/huiyu/NGS_1.py ${dir} ${file}


# Build a FASTA of CDR3s: header = sequence_id (column 1), sequence = cdr3_aa (column 6)
awk 'BEGIN{FS="\t"} NR > 1 {print ">"$1;print $6}' ${dir}/${file}test1.txt  > ${dir}/cluster.fa

~/software/cd-hit-v4.8.1-2019-0228/cd-hit -i ${dir}/cluster.fa -o ${dir}/cluster.test  -c 0.75 -g 1 -n 4 -d 0 -l 4 -S 0  -sc 1 # Wang wants similarity >= 80%; cd-hit apparently can only select > 80%
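# For reference, my reading of the cd-hit 4.8.1 flags used here: -c 0.75 sequence
# identity threshold; -n 4 word size; -g 1 slower but more accurate mode (assign each
# sequence to its most similar cluster, not the first matching one); -d 0 keep full
# sequence names in the .clstr file; -l 4 throw away short sequences (length threshold 4);
# -S 0 zero length difference allowed, so every member must match the representative's
# length; -sc 1 sort clusters by decreasing size in the output.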

## Note: adjust the paths below as needed
cat ${dir}/cluster.test.clstr |sed 's/.*>/>/g'|sed 's/\..*//g' |tr -d '\n' | sed 's/>C/\nC/g' | awk 'BEGIN{FS=">"} {for(i=2;i<=NF;i++){print $1"\t"$i}}' > ${dir}/test.cluster.txt

sed -i '1i cluster_id\tsequence_id' ${dir}/test.cluster.txt
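# The pipeline above flattens cd-hit's .clstr output into (cluster_id, sequence_id)
# pairs. For readability, the same transformation as a commented Python sketch
# (assuming the usual .clstr layout: ">Cluster N" headers, then member lines in
# which cd-hit appends "..." to each sequence name):
#
#   import re
#
#   def parse_clstr(path):
#       cluster = None
#       with open(path) as fh:
#           for line in fh:
#               if line.startswith(">Cluster"):
#                   cluster = line[1:].strip()            # e.g. "Cluster 0"
#               else:
#                   m = re.search(r">(.*?)\.\.\.", line)  # member id before "..."
#                   if m:
#                       yield cluster, m.group(1)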
######
# Find the representative CDR3s
# This file cannot be used directly for the round 2 benchmarking, because the cluster_id numbers are changed further down
cat cluster.test.clstr | grep -A 1 ">C" > represent.txt

sed -i '/--/d' represent.txt

cat represent.txt |sed 's/.*>/>/g' | sed 's/\..*//g' |awk '{if(NR%2==1) print $0}' > cluster.name.txt # odd lines: ">Cluster N" headers

cat represent.txt |sed 's/.*>/>/g' | sed 's/\..*//g' |awk '{if(NR%2==0) print $0}' > index_name.txt  # even lines: representative sequence ids

paste cluster.name.txt index_name.txt > represent_cluster.txt

sed -i 's/>//g' index_name.txt

/public/home/qingw/CORE/bin/lt_create_idx cluster.fa
/public/home/qingw/CORE/bin/ltr_gwc cluster.fa index_name.txt  > represent_retrive.fa
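# lt_create_idx / ltr_gwc look like in-house tools that index cluster.fa and pull out
# the records listed in index_name.txt. If they are unavailable, a commented Python
# sketch with the same effect (assuming the two-line FASTA records produced by the
# awk step above, and the bare IDs left in index_name.txt after the sed -i):
#
#   def retrieve_fasta(fasta, ids_file, out_path):
#       wanted = {line.strip() for line in open(ids_file) if line.strip()}
#       with open(fasta) as fh, open(out_path, "w") as out:
#           for header in fh:
#               seq = next(fh)                  # records are header + one sequence line
#               if header.strip().lstrip(">") in wanted:
#                   out.write(header + seq)
#
#   retrieve_fasta("cluster.fa", "index_name.txt", "represent_retrive.fa")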

awk '{if(NR%2==1) print $0}' represent_retrive.fa  > sequence_id.txt # FASTA headers

awk '{if(NR%2==0) print $0}' represent_retrive.fa  > sequence_cdr3.txt # CDR3 sequences

paste sequence_id.txt sequence_cdr3.txt > represent_retrieve_1.txt

paste represent_cluster.txt represent_retrieve_1.txt | cut -f 1,4 > ${file}.dictionary.txt  ## modified 2021-08-28
### Representative CDR3s extracted

# Summary statistics
a=$(echo ${file} |cut -d "_" -f1)

# Count reads
echo -e "name\tcount" >> ${file}.count.txt   ## modified 2021-08-28
echo -e "reads\t$(cat ../../Fastqc/multiqc/multiqc_data/multiqc_fastqc.txt |grep "${a}" |cut -f 5 |uniq)" >> ${file}.count.txt

# Count contigs
ls ../ | grep ${a} |grep -v "vector" | while read id ;do echo $id && awk '{if(NR%4 == 2) print $0}' ../$id | wc -l ;done > count.txt
awk 'NR==2 {print "contig\t" $0}' count.txt >> ${file}.count.txt

# Count VHH-containing contigs
# 2021-09-06: this value is filled in by pandas; 3.14 is just a placeholder
echo -e "VHH_contain_contig\t3.14" >>  ${file}.count.txt


# Count clusters
# Filled in by pandas; placeholder value
echo -e "CDR3_cluster\t3.14" >>  ${file}.count.txt  ## modified 2021-08-28

# Count unique proteins
# Filled in by pandas; placeholder value
echo -e "unique_protein\t3.14" >>  ${file}.count.txt

/usr/bin/python3 /public/home/djs/huiyu/NGS_2.py ${dir} ${file} 

##################################################################################
# Python script 1: NGS_1.py
#!/usr/bin/python3
#coding=utf-8

# Dir: first argument, the directory containing the files
# File: second argument, the file to process


import sys
import os
import numpy as np
import pandas as pd

Dir = sys.argv[1]
File = sys.argv[2]
os.chdir(Dir)
df1 = pd.read_csv(File,sep='\t')
# Coarse filtering
df2 = df1[(df1["stop_codon"] == 'F') & (df1["vj_in_frame"] == 'T') & (df1["v_frameshift"] == 'F') & (df1["productive"] == 'T') & (df1["complete_vdj"] == 'T')]
# Keep the useful columns
df2 = df2.loc[:,["sequence_id","sequence","v_call","j_call","sequence_alignment_aa","cdr3_aa"]]
# Some sequences may lack a CDR3; drop those rows
df2 = df2.dropna(axis=0)
# Coarse filter by contig support per protein (keep counts of contig in the same unique protein > 10)
df2["cDNA_contig"] = df2.sequence_id.map(lambda x:x.split("+")[1]) # the contig count is embedded in sequence_id after "+"
df2.cDNA_contig = df2.cDNA_contig.astype("int64")
df3 = df2.groupby("sequence_alignment_aa").cDNA_contig.sum()
df3.name = "counts_of_contig_in_the_same_AA"
df4 = pd.merge(df2,df3,on="sequence_alignment_aa",how="left")  # roughly 100k rows
df4 = df4.loc[df4.counts_of_contig_in_the_same_AA > 10] # roughly 30k rows
# Save; the next step clusters by CDR3 sequence
df4.to_csv(File+"test1.txt",sep="\t",index=False)
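The groupby/merge/filter idiom above carries the whole script. A standalone toy illustration (made-up values, not part of the pipeline) of what it does:

import pandas as pd

toy = pd.DataFrame({"sequence_alignment_aa": ["PROT_A", "PROT_A", "PROT_B"],
                    "cDNA_contig": [8, 5, 3]})
totals = toy.groupby("sequence_alignment_aa").cDNA_contig.sum()
totals.name = "counts_of_contig_in_the_same_AA"
merged = pd.merge(toy, totals, on="sequence_alignment_aa", how="left")
print(merged.loc[merged.counts_of_contig_in_the_same_AA > 10])
# PROT_A (8 + 5 = 13 contigs) survives; PROT_B (3 contigs) is dropped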

##########################

# Python script 2: NGS_2.py
#!/usr/bin/python3
#coding=utf-8

import sys
import os
import numpy as np
import pandas as pd
#from openpyxl import load_workbook

Dir = sys.argv[1]
File = sys.argv[2]
os.chdir(Dir)

# Read the data produced by NGS_1.py
df1 = pd.read_csv(File+"test1.txt",sep="\t")
# Attach the clustering results to df1
df2 = pd.read_csv("test.cluster.txt",sep="\t")
df3 = pd.merge(df1,df2,on="sequence_id")
#
df3.cDNA_contig = df3.cDNA_contig.astype("int64")

#df3.DNA_count.dtype
df4 = df3.groupby(["v_call","j_call","cluster_id"]).cDNA_contig.sum()
df4.name = "counts_of_contig_in_the_same_clonotype"
df5 = pd.merge(df3,df4,on=["v_call","j_call","cluster_id"])
df5["clonotype"] = df5["cluster_id"].map(lambda x:x.split(" ")[1])+"@"+df5["v_call"].map(lambda x:x.split("*")[0])+"@"+df5["j_call"].map(lambda x:x.split("*")[0])
# Note: rows sharing a cluster_id do not necessarily share the same V & J germline reference,
# so the clonotype count here requires identical V & J germline references plus CDR3s of equal length and >80% similarity

df6 = df5.groupby("cluster_id").cDNA_contig.sum()
df6.name = "counts_of_contig_in_the_same_cluster"
df6 = pd.merge(df5,df6,on="cluster_id")
df6.cluster_id = df6.cluster_id.map(lambda x:x.split()[1]) # "Cluster N" -> "N"
#
df7 = df6.loc[:,["clonotype","counts_of_contig_in_the_same_clonotype","cluster_id","counts_of_contig_in_the_same_cluster","cdr3_aa","counts_of_contig_in_the_same_AA","cDNA_contig","v_call","j_call","sequence_alignment_aa","sequence"]]

# Pull out the representative sequences
df13 = pd.read_csv(File+".dictionary.txt",sep="\t",names=["cluster_id","representative_CDR3"])
df13["cluster_id"] = df13["cluster_id"].map(lambda x:x.split(' ')[1])
df7 = pd.merge(df13,df7,on="cluster_id")
#df13 = df13.loc[:,["cluster","representative_CDR3"]]

# Renumber the clusters by descending counts_of_contig_in_the_same_cluster
df7 = df7.sort_values("counts_of_contig_in_the_same_cluster",ascending=False)
x = pd.Series(list(df7.cluster_id.drop_duplicates(keep="first")))
y = pd.Series(list(range(1,len(x)+1)))
df8 = pd.concat([x,y],axis=1)
df8 = df8.rename(columns={0:"cluster_id",1:"cluster"})
df7 = pd.merge(df7,df8,on="cluster_id",how="left")
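# An equivalent, arguably clearer way to build the same numbering (a sketch, not
# executed here; enumerate walks the ids in the sorted order established above):
#   rank = {cid: i for i, cid in enumerate(df7.cluster_id.drop_duplicates(), start=1)}
#   df7["cluster"] = df7.cluster_id.map(rank)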
df7 = df7.loc[:,["clonotype","counts_of_contig_in_the_same_clonotype","cluster","counts_of_contig_in_the_same_cluster","cdr3_aa","counts_of_contig_in_the_same_AA","cDNA_contig","v_call","j_call","representative_CDR3","sequence_alignment_aa","sequence"]]
# The master table is done
# OK; the representative CDR3s and corrected cluster numbers are written back to File+".dictionary.txt" below

# A further requirement: the final analysis table should show how many protein variants each cluster contains, which again needs the master table
#table of unique protein 
df9 = df7.sort_values(["counts_of_contig_in_the_same_AA","cDNA_contig"],ascending=False) 
df9 = df9.drop_duplicates(subset="sequence_alignment_aa",keep="first")   

#table of unique clonotype
df10 = df7.sort_values(["cDNA_contig","counts_of_contig_in_the_same_AA","counts_of_contig_in_the_same_cluster"],ascending=False)
df10 = df10.drop_duplicates(subset="clonotype", keep='first')   # 72 rows
 
#table of unique cluster
df11 = df7.sort_values(["counts_of_contig_in_the_same_cluster","cDNA_contig","counts_of_contig_in_the_same_AA"],ascending=False)
df11 = df11.drop_duplicates(subset="cluster", keep='first')   # 66 rows
# Write the representative CDR3s and corrected cluster numbers back to File+".dictionary.txt"
df11.loc[:,["cluster","representative_CDR3"]].to_csv(File+".dictionary.txt",sep="\t",index=False)


# Now build the final analysis table
#table of analysis
df12 = df11.loc[:,["cluster","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3"]]

#df12["frequency_of_total"] = df12.counts_of_contig_in_the_same_cluster/df3.cDNA_contig.sum()

df12["VH_frequency"] = df12.counts_of_contig_in_the_same_cluster/df12.counts_of_contig_in_the_same_cluster.sum()

df12["CDR3_length"] = df12.cdr3_aa.apply(lambda x:len(x))

# Almost forgot to pull out the protein variety counts
# Built from the table of unique protein
df16 = df9.groupby("cluster",as_index=False).agg("count")
df16 = df16.loc[:,["cluster","sequence_alignment_aa"]]
df16 = df16.rename(columns={"sequence_alignment_aa":"No_of_unique_VH_seq"})
df16.cluster = df16.cluster.astype("string")

df12.cluster = df12.cluster.astype("string")
df14 = pd.merge(df12,df16,on="cluster",how="inner")
# Rename columns
df14 = df14.rename(columns={"counts_of_contig_in_the_same_cluster":"No_of_VH_seq","representative_CDR3":"Representative_CDR3","cdr3_aa":"Most_abundant_CDR3"})
# Reorder the columns
df14 = df14.loc[:,["cluster","No_of_unique_VH_seq","No_of_VH_seq","VH_frequency","Most_abundant_CDR3","CDR3_length","Representative_CDR3"]]

# Put the summary statistics at the top of the sheet
df15 = pd.read_csv(File+".count.txt",sep="\t")
df15.iloc[2,1] = df3.cDNA_contig.sum() # VHH_contain_contig
df15.iloc[3,1] = df11.cluster.count() # CDR3_cluster
df15.iloc[4,1] = df9.sequence_alignment_aa.count() # unique_protein

# Write everything to an Excel workbook
with pd.ExcelWriter(File+".result.xlsx") as writer:
    df7.to_excel(writer, sheet_name="all_of_contig",index=False)
    df9.to_excel(writer, sheet_name="table_of_unique_protein",index=False)
    df10.to_excel(writer, sheet_name="table_of_unique_clonotype",index=False)
    df11.to_excel(writer, sheet_name="table_of_unique_cluster",index=False)
    df14.to_excel(writer, sheet_name="table_of_analysis",index=False,startrow=8)
    df15.to_excel(writer, sheet_name="table_of_analysis",index=False,startrow=0)

##################################################################################
bash NGS_round2.sh $(pwd) round1.txt round2.txt

# Shell script: NGS_round2.sh
#!/usr/bin/bash

file=${2} # the IgBLAST result file to process
dir=${1} # directory containing the files
round1=${3} # reference file for benchmarking, e.g. the IgBLAST result from panning round 1

# Filter, then drop sequences whose protein is supported by 10 or fewer contigs
/usr/bin/python3 /public/home/djs/huiyu/NGS_1.py ${dir} ${file}

/usr/bin/python3 /public/home/djs/huiyu/NGS_3.py ${dir} ${file} ${round1}

# Cluster the newly appeared CDR3s
awk 'BEGIN{FS="\t"} NR > 1 {print ">"$1;print $6}' ${dir}/Round2.recluster.txt > ${dir}/cluster.fa

~/software/cd-hit-v4.8.1-2019-0228/cd-hit -i ${dir}/cluster.fa -o ${dir}/cluster.test  -c 0.8 -g 1 -n 4 -d 0 -l 4 -S 0  -sc 1

cat ${dir}/cluster.test.clstr |sed 's/.*>/>/g'|sed 's/\..*//g' |tr -d '\n' | sed 's/>C/\nC/g' | awk 'BEGIN{FS=">"} {for(i=2;i<=NF;i++){print $1"\t"$i}}' > ${dir}/test.cluster.txt

sed -i '1i cluster_id\tsequence_id' ${dir}/test.cluster.txt

# Produce some summary statistics
a=$(echo ${file} |cut -d "_" -f1)

# Count reads
echo -e "name\tcount" >> ${file}.count.txt   ## modified 2021-08-28
echo -e "reads\t$(cat ../../Fastqc/multiqc/multiqc_data/multiqc_fastqc.txt |grep "${a}" |cut -f 5 |uniq)" >> ${file}.count.txt

# Count contigs
ls ../ | grep ${a} |grep -v "vector" | while read id ;do echo $id && awk '{if(NR%4 == 2) print $0}' ../$id | wc -l ;done > count.txt
awk 'NR==2 {print "contig\t" $0}' count.txt >> ${file}.count.txt

# Count VHH-containing contigs
# Filled in by pandas; placeholder value
echo -e "VHH_contain_contig\t3.14" >>  ${file}.count.txt  ## modified 2021-08-28


# Count clusters
# Filled in by pandas; placeholder value
echo -e "CDR3_cluster\t1314" >>  ${file}.count.txt  ## modified 2021-08-28

# Count unique proteins
# Filled in by pandas; placeholder value
echo -e "unique_protein\t1314" >>  ${file}.count.txt

# Progress messages could be added here to show how far the run has gotten

/usr/bin/python3 /public/home/djs/huiyu/NGS_4.py ${dir} ${file} ${round1}
################################################################################################
# Round 2 analysis begins
# Python script 1: NGS_1.py (identical to the round 1 listing above, so not repeated here)


##################################

#!/usr/bin/python3
#coding = utf-8
#NGS_3.py

import sys
import os
import numpy as np
import pandas as pd
import Levenshtein
#from subprocess import call


Dir = sys.argv[1]
File = sys.argv[2]
Round1 = sys.argv[3]
os.chdir(Dir)

df1 = pd.read_csv(File+"test1.txt",sep="\t")

# Read in the Round 1 representative CDR3s
df2 = pd.read_csv(Round1+".dictionary.txt",sep="\t")
df2 = df2.rename(columns={"cluster":"cluster_id"})

list1 = list(df1.cdr3_aa) 
dictionary = dict(zip(list(df2.cluster_id),list(df2.representative_CDR3)))
x = [] # cluster assignment for each CDR3

for i in list1:
    list3 = [1000000] # candidate cluster ids (1000000 is a sentinel meaning "no match")
    list4 = [1000000] # candidate hamming distances
    for key,value in dictionary.items():
        if len(i) == len(value) and Levenshtein.hamming(i,value)/len(i) <= 0.2: # same length and >=80% identity
            list3.append(key)
            list4.append(Levenshtein.hamming(i,value))

    index = list4.index(min(list4))
    x.append(list3[index])
    del list3
    del list4
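# The double loop above rescans every Round 1 representative for every CDR3, and a
# hamming distance is only attempted on equal-length pairs anyway. Bucketing the
# representatives by length first gives the same result with far less scanning
# (a commented sketch, a drop-in for the loop above):
#
#   from collections import defaultdict
#   by_len = defaultdict(list)
#   for key, value in dictionary.items():
#       by_len[len(value)].append((key, value))
#
#   def best_cluster(cdr3, cutoff=0.2):
#       best_key, best_dist = 1000000, 1000000
#       for key, value in by_len.get(len(cdr3), []):
#           d = Levenshtein.hamming(cdr3, value)
#           if d / len(cdr3) <= cutoff and d < best_dist:
#               best_key, best_dist = key, d
#       return best_key
#
#   x = [best_cluster(i) for i in list1]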
	
# Map df1's CDR3s onto the Round 1 clusters
df1["cluster_id"] = x
df3 = pd.merge(df1,df2,on="cluster_id",how="left")

# How should the new clusters be numbered?
# Couldn't manage it in Python, so it goes back to the shell
df20 = df3.loc[df3.cluster_id == 1000000] # no Round 1 match: these get re-clustered
df21 = df3.loc[df3.cluster_id != 1000000] # matched to a Round 1 cluster
df20.to_csv("Round2.recluster.txt",sep = "\t",index=False)
df21.to_csv("Round2.txt",sep = "\t",index=False)

#########################

#!/usr/bin/python3
#coding = utf-8
#NGS_4.py

import sys
import os
import numpy as np
import pandas as pd
import Levenshtein
#from subprocess import call


Dir = sys.argv[1]
File = sys.argv[2]
Round1 = sys.argv[3]
os.chdir(Dir)


df20 = pd.read_csv("Round2.recluster.txt",sep = "\t")
df21 = pd.read_csv("Round2.txt",sep = "\t")
df22 = pd.read_csv("test.cluster.txt",sep="\t")

# A bit convoluted, but the upshot: each CDR3 that newly appears in round 2 gets a cluster ID, and the frame is shaped to match df21's structure
df20 = pd.merge(df20,df22,on="sequence_id",how="right")
df20.cluster_id_y = df20.cluster_id_y.map(lambda x:x.split()[1]) # "Cluster N" -> "N"
df20.cluster_id_y = df20.cluster_id_y.astype("int64")
df20["cluster"] = df20["cluster_id_x"] + df20["cluster_id_y"] # sentinel 1000000 + new cluster number keeps new clusters >= 1000000
df20 = df20.loc[:,["sequence_id","sequence","v_call","j_call","sequence_alignment_aa","cdr3_aa","cDNA_contig","counts_of_contig_in_the_same_AA","cluster","representative_CDR3"]]
df20 = df20.rename(columns={"cluster":"cluster_id"})


# Append the newly clustered round 2 CDR3s to the output of NGS_3.py
df3 = pd.concat([df21,df20],ignore_index=True)
df3.cDNA_contig = df3.cDNA_contig.astype("int64")

#df3.DNA_count.dtype
df4 = df3.groupby(["v_call","j_call","cluster_id"]).cDNA_contig.sum()
df4.name = "counts_of_contig_in_the_same_clonotype"
df5 = pd.merge(df3,df4,on=["v_call","j_call","cluster_id"])
df5.cluster_id = df5.cluster_id.astype("string")
df5["clonotype"] = df5["cluster_id"]+"@"+df5["v_call"].map(lambda x:x.split("*")[0])+"@"+df5["j_call"].map(lambda x:x.split("*")[0])
#
df6 = df5.groupby("cluster_id").cDNA_contig.sum()
df6.name = "counts_of_contig_in_the_same_cluster"
df6 = pd.merge(df5,df6,on="cluster_id") #

df7 = df6.loc[:,["clonotype","counts_of_contig_in_the_same_clonotype","cluster_id","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3","counts_of_contig_in_the_same_AA","cDNA_contig","v_call","j_call","sequence_alignment_aa","sequence"]]
# This is the master table
df7 = df7.sort_values("counts_of_contig_in_the_same_cluster",ascending=False)
df7.cluster_id = df7.cluster_id.astype("int64")
# The newly appeared clusters still need numbers that follow on from the Round 1 cluster ids
df8 = pd.read_csv(Round1+".dictionary.txt",sep="\t") # NGS_2.py rewrote this file with a header: cluster, representative_CDR3

y = pd.Series(list(df7.cluster_id.drop_duplicates())) # original ids (new clusters still >= 1000000)
x = pd.Series(list(df7.cluster_id.drop_duplicates())) # copy that will receive the renumbering
count = len(df8.cluster) + 1 # continue numbering after the last Round 1 cluster
num = 0
while num < len(x):
	if x[num] >= 1000000:
		x[num] = count
		count = count + 1
	num = num + 1
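# Equivalent vectorized form of the while-loop above (a sketch, not executed here;
# it relies on all sentinel ids being >= 1000000 and on count starting after the
# Round 1 clusters):
#   mask = x >= 1000000
#   x.loc[mask] = list(range(count, count + int(mask.sum())))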
	

df8 = pd.concat([y,x],axis=1)
df8 = df8.rename(columns={0:"cluster_id",1:"cluster"})
df7 = pd.merge(df7,df8,on="cluster_id",how="left")
df7 = df7.loc[:,["clonotype","counts_of_contig_in_the_same_clonotype","cluster","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3","counts_of_contig_in_the_same_AA","cDNA_contig","v_call","j_call","sequence_alignment_aa","sequence"]]
df7 = df7.rename(columns={"cluster":"cluster_id"})
# Only now is the master table complete

#table of unique protein
df8 = df7.sort_values(["counts_of_contig_in_the_same_AA","cDNA_contig"],ascending=False)
df8 = df8.drop_duplicates(subset="sequence_alignment_aa",keep="first")  # 51,816 rows

# A trickier problem: flag the proteins that newly appear in Round 2 relative to Round 1
# The reference round is given as a script argument
# The comparison should use the data produced by running NGS_1.py on round 1

df_8 = pd.read_csv(Round1+"test1.txt",sep="\t")

# Helper: returns True if this protein never appeared in Round 1
def Protein(str1):
    for str2 in list(df_8.sequence_alignment_aa.drop_duplicates()):
        if str1 == str2:
            return False
    return True

df8["new_protein"] = df8.sequence_alignment_aa.apply(lambda str1:Protein(str1)) # severe bottleneck; still thinking about how to optimize it


#table of unique clonotype
df10 = df7.sort_values(["cDNA_contig","counts_of_contig_in_the_same_AA","counts_of_contig_in_the_same_cluster"],ascending=False)
df10 = df10.drop_duplicates(subset="clonotype", keep='first')   # 52 rows
 
#table of unique cluster
df11 = df7.sort_values(["counts_of_contig_in_the_same_cluster","cDNA_contig","counts_of_contig_in_the_same_AA"],ascending=False)
df11 = df11.drop_duplicates(subset="cluster_id", keep='first')   # 47 rows


# Now for the final analysis table (the headache part)
#table of analysis
df12 = df11.loc[:,["cluster_id","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3","cDNA_contig"]] 

#df12["frequency_of_total"] = df12.counts_of_contig_in_the_same_cluster/df3.cDNA_contig.sum()

df12["frequency"] = df12.counts_of_contig_in_the_same_cluster/df12.counts_of_contig_in_the_same_cluster.sum()

df12["CDR3_length"] = df12.cdr3_aa.apply(lambda x:len(x))

df12 = df12.loc[:,["cluster_id","counts_of_contig_in_the_same_cluster","frequency","cdr3_aa","representative_CDR3","CDR3_length","cDNA_contig"]]
df12.cluster_id = df12.cluster_id.astype("string")
#df12 = df12.sort_values("cluster_id")
# Almost forgot to pull out the protein variety counts
# Built from the table of unique protein
df16 = df8.groupby("cluster_id",as_index=False).agg("count")
df16 = df16.loc[:,["cluster_id","sequence_alignment_aa"]]
df16 = df16.rename(columns={"sequence_alignment_aa":"variety_of_protein"})
df16.cluster_id = df16.cluster_id.astype("string")
df12 = pd.merge(df12,df16,on="cluster_id",how="inner")

df14 = df12.rename(columns={"cluster_id":"cluster","counts_of_contig_in_the_same_cluster":"No_of_VH_seq","representative_CDR3":"Representative_CDR3","cdr3_aa":"Most_abundant_CDR3","frequency":"VH_frequency","variety_of_protein":"No_of_unique_VH_seq"})
# Reorder the columns
df14 = df14.loc[:,["cluster","No_of_unique_VH_seq","No_of_VH_seq","VH_frequency","Most_abundant_CDR3","CDR3_length","Representative_CDR3"]]
df14["new_family"] = df14.Representative_CDR3.isna()
x = list(df14.new_family)
i = 0
while i < len(x):
	if x[i] == False:
		x[i] = ""
		i = i+1
	else:
		x[i] = "new"
		i = i+1
		
df14.new_family = x
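# The flag loop above collapses to a single vectorized expression (np is already
# imported as numpy):
#   df14["new_family"] = np.where(df14.Representative_CDR3.isna(), "new", "")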

# Put the summary statistics at the top of the sheet
df15 = pd.read_csv(File+".count.txt",sep="\t")
df15.iloc[2,1] = df3.cDNA_contig.sum() # VHH_contain_contig
df15.iloc[3,1] = df11.cluster_id.count() # CDR3_cluster
df15.iloc[4,1] = df8.sequence_alignment_aa.count() # unique_protein

# Write everything to an Excel workbook
with pd.ExcelWriter(File+".result.xlsx") as writer:
    df7.to_excel(writer, sheet_name="all_of_contig",index=False)
    df8.to_excel(writer, sheet_name="table_of_unique_protein",index=False)
    df10.to_excel(writer, sheet_name="table_of_unique_clonotype",index=False)
    df11.to_excel(writer, sheet_name="table_of_unique_cluster",index=False)
    df14.to_excel(writer, sheet_name="table_of_analysis",index=False,startrow=8)
    df15.to_excel(writer, sheet_name="table_of_analysis",index=False,startrow=0)
