VHH高通量测序数据分析---版本3

大家好，我是讯享网，很高兴认识大家。
#Round3与round2的对标 #就是希望根据round2的cluster id and representative_CDR3然后命名round3 #然后还需要将round2的unique protein 以及round3的unique protein对标出来 #/usr/bin/bash #coding=utf-8 #$1 是文件所在目录 #$2 是roudn2的文件 #$3 是round3的文件 dir=$1 round2=$2 round3=$3 #前期使用NGS_1.py已经粗略的筛选过了，文件为${round3}.test1.txt #此时需要将round2的cluster id and representative_CDR3 以及unique protein提取出来 /usr/bin/python3 /public/home/djs/huiyu/NGS_5.py ${dir} ${round3} ${round2} #新出现的CDR3聚类 awk 'BEGIN{FS="\t"} NR > 1 {print ">"$1;print $6}' ${dir}/Round2.recluster.txt > ${dir}/cluster.fa ~/software/cd-hit-v4.8.1-2019-0228/cd-hit -i ${dir}/cluster.fa -o ${dir}/cluster.test -c 0.8 -g 1 -n 4 -d 0 -l 4 -S 0 -sc 1 cat ${dir}/cluster.test.clstr |sed 's/.*>/>/g'|sed 's/\..*//g' |tr -d '\n' | sed 's/>C/\nC/g' | awk 'BEGIN{FS=">"} {for(i=2;i<=NF;i++){print $1"\t"$i}}' > ${dir}/test.cluster.txt sed -i '1i cluster_id\tsequence_id' ${dir}/test.cluster.txt  #新出现的CDR3聚类后进行后续分析 /usr/bin/python3 /public/home/djs/huiyu/NGS_6.py ${dir} ${round3} ${round2} #!/usr/bin/python3 #coding = utf-8 #NGS_5.py import sys import os import numpy as np import pandas as pd import Levenshtein #from subprocess import call Dir = sys.argv[1] Round3 = sys.argv[2] Round2 = sys.argv[3] os.chdir(Dir) #将round2的cluster id and representative_CDR3保存到df1 #df1 = pd.read_excel("NB259-h-R2-1_S9_L001_R1_001.fastq.gz.trim_1P.fastq.gz.merge.out.trim_vector.sort.sort.sort.txt.result.xlsx",sheet_name="table_of_unique_cluster",engine="openpyxl") df1 = pd.read_csv(Round2+".result.xlsx",sheet_name="table_of_unique_cluster",engine="openpyxl") df1 = df1.loc[:,["cluster_id","cdr3_aa"]] #将Round3过滤后的序列读取进来 #df2 = pd.read_csv("NB259-C-R3-1_S11_L001_R1_001.fastq.gz.trim_1P.fastq.gz.merge.out.trim_vector.sort.sort.sort.txttest1.txt",sep="\t") df2 = pd.read_csv(Round3+"test1.txt",sep="\t") df2 = df2.rename(columns={ 
   "cluster":"cluster_id"}) #根据round2的信息命名round3的cluster list1 = list(df2.cdr3_aa) dictionary = dict(zip(list(df1.cluster_id),list(df1.cdr3_aa))) x = [] #储存cluster for i in list1: list3 = [] #储存key list4 = [] #储存value for key,value in dictionary.items(): if len(i) == len(value) and Levenshtein.hamming(i,value)/len(i) <= 0.2: list3.append(key) list4.append(Levenshtein.hamming(i,value)) index = list4.index(min(list4)) x.append(list3[index]) del list3 del list4 #尝试将df3的cdr3与Round2的CDR3对标起来 df2["cluster_id"] = x df3 = pd.merge(df2,df1,on="cluster_id",how="left") #new_cluster如何编号？ #在python中搞不定，去shell中 df20 = df3.loc[df3.cluster_id == ] df21 = df3.loc[df3.cluster_id != ] df20.to_csv("Round2.recluster.txt",sep = "\t",index=False) df21.to_csv("Round2.txt",sep = "\t",index=False)  #!/usr/bin/python3 #coding = utf-8 #NGS_6.py import sys import os import numpy as np import pandas as pd import Levenshtein #from subprocess import call Dir = sys.argv[1] Round3 = sys.argv[2] Round2 = sys.argv[3] os.chdir(Dir) df20 = pd.read_csv("Round2.recluster.txt",sep = "\t") df21 = pd.read_csv("Round2.txt",sep = "\t") df22 = pd.read_csv("test.cluster.txt",sep="\t") #有点弯弯绕绕的，结果就是将round1新出线的CDR3给个clusterID然后制造成与df21结构一样的数据框 df20 = pd.merge(df20,df22,on="sequence_id",how="right") df20.cluster_id_y = df20.cluster_id_y.map(lambda x:x.split()[1]) df20.cluster_id_y = df20.cluster_id_y.astype("int64") df20["cluster"] = df20["cluster_id_x"] + df20["cluster_id_y"] df20 = df20.loc[:,["sequence_id","sequence","v_call","j_call","sequence_alignment_aa","cdr3_aa_x","cDNA_contig","counts_of_contig_in_the_same_AA","cluster","cdr3_aa_y"]] df20 = df20.rename(columns={ 
   "cluster":"cluster_id"}) #把round3新出现的CDR3聚类后添加到NGS_5.py产生的结果后面 df3 = pd.concat([df21,df20],ignore_index=True) df3.cDNA_contig = df3.cDNA_contig.astype("int64") df3 = df3.rename(columns={ 
   "cdr3_aa_x":"cdr3_aa","cdr3_aa_y":"representative_CDR3"}) #df3.DNA_count.dtype df4 = df3.groupby(["v_call","j_call","cluster_id"]).cDNA_contig.sum() #xxxx df4.name = "counts_of_contig_in_the_same_clonotype" df5 = pd.merge(df3,df4,on=["v_call","j_call","cluster_id"]) df5.cluster_id = df5.cluster_id.astype("string") df5["clonotype"] = df5["cluster_id"]+"@"+df5["v_call"].map(lambda x:x.split("*")[0])+"@"+df5["j_call"].map(lambda x:x.split("*")[0]) # df6 = df5.groupby("cluster_id").cDNA_contig.sum() df6.name = "counts_of_contig_in_the_same_cluster" df6 = pd.merge(df5,df6,on="cluster_id") # df7 = df6.loc[:,["clonotype","counts_of_contig_in_the_same_clonotype","cluster_id","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3","counts_of_contig_in_the_same_AA","cDNA_contig","v_call","j_call","sequence_alignment_aa","sequence"]] #这就是总表了啊 df7 = df7.sort_values("counts_of_contig_in_the_same_cluster",ascending=False) df7.cluster_id = df7.cluster_id.astype("int64") #还需要将新出现的cluster根据round2的cluster标号,即round3中出现了大于的cluster id #df8 = pd.read_excel("NB259-h-R2-1_S9_L001_R1_001.fastq.gz.trim_1P.fastq.gz.merge.out.trim_vector.sort.sort.sort.txt.result.xlsx",sheet_name="table_of_unique_cluster",engine="openpyxl") df8 = pd.read_excel(Round2+".result.xlsx",sheet_name="table_of_unique_cluster",engine="openpyxl") df8 = df8.loc[:,["cluster_id","cdr3_aa"]] #df8 = pd.read_csv(Round1+".dictionary.txt",sep="\t",names=["cluster","represent"]) y = pd.Series(list(df7.cluster_id.drop_duplicates())) x = pd.Series(list(df7.cluster_id.drop_duplicates())) count = max(df8.cluster_id) + 1 num = 0 while num < len(x): if x[num] >= : x[num] = count count = count + 1 num = num + 1 df8 = pd.concat([y,x],axis=1) df8 = df8.rename(columns={ 
   0:"cluster_id",1:"cluster"}) df7 = pd.merge(df7,df8,on="cluster_id",how="left") df7 = df7.loc[:,["clonotype","counts_of_contig_in_the_same_clonotype","cluster","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3","counts_of_contig_in_the_same_AA","cDNA_contig","v_call","j_call","sequence_alignment_aa","sequence"]] df7 = df7.rename(columns={ 
   "cluster":"cluster_id"}) #此时总表才完成  #table of unique protein  df8 = df7.sort_values(["counts_of_contig_in_the_same_AA","cDNA_contig"],ascending=False) # df8 = df8.drop_duplicates(subset="sequence_alignment_aa",keep="first") #51816行 #还有一个挑战的问题，根据Round2标记Round3中新出现的protein  #此时应该与round2运行NGS_1.py后的数据进行对比 #df_8 = pd.read_csv("NB259-h-R2-1_S9_L001_R1_001.fastq.gz.trim_1P.fastq.gz.merge.out.trim_vector.sort.sort.sort.txttest1.txt",sep="\t") df_8 = pd.read_csv(Round2+"test1.txt",sep="\t") #定义一个函数 def Protein(str1): for str2 in list(df_8.sequence_alignment_aa.drop_duplicates()): if str1 == str2: return False else: continue return True df8["new_protein"] = df8.sequence_alignment_aa.apply(lambda str1:Protein(str1)) #超级限速步骤，在想着怎么优化 #table of unique clonotype df10 = df7.sort_values(["cDNA_contig","counts_of_contig_in_the_same_AA","counts_of_contig_in_the_same_cluster"],ascending=False) df10 = df10.drop_duplicates(subset="clonotype", keep='first') #52行 #table of unique cluster df11 = df7.sort_values(["counts_of_contig_in_the_same_cluster","cDNA_contig","counts_of_contig_in_the_same_AA"],ascending=False) df11 = df11.drop_duplicates(subset="cluster_id", keep='first') #47行 #开始最后的分析表格制作了，头大 #table of analysis df12 = df11.loc[:,["cluster_id","counts_of_contig_in_the_same_cluster","cdr3_aa","representative_CDR3","cDNA_contig"]] #df12["frequency_of_total"] = df12.counts_of_contig_in_the_same_cluster/df3.cDNA_contig.sum() df12["frequency"] = df12.counts_of_contig_in_the_same_cluster/df12.counts_of_contig_in_the_same_cluster.sum() df12["CDR3_length"] = df12.cdr3_aa.apply(lambda x:len(x)) df12 = df12.loc[:,["cluster_id","counts_of_contig_in_the_same_cluster","frequency","cdr3_aa","representative_CDR3","CDR3_length","cDNA_contig"]] df12.cluster_id = df12.cluster_id.astype("string") #df12 = df12.sort_values("cluster_id") #忘记了还要把蛋白种类提出来，WC #用table of unique protein 制作这个数据 df16 = df8.groupby("cluster_id",as_index=False).agg("count") df16 = df16.loc[:,["cluster_id","sequence_alignment_aa"]] df16 = df16.rename(columns={ 
   "sequence_alignment_aa":"variety_of_protein"}) df16.cluster_id = df16.cluster_id.astype("string") df12 = pd.merge(df12,df16,on="cluster_id",how="inner") df14 = df12.rename(columns={ 
   "cluster_id":"cluster","counts_of_contig_in_the_same_cluster":"No_of_VH_seq","representative_CDR3":"Representative_CDR3","cdr3_aa":"Most_abundant_CDR3","frequency":"VH_frequency","variety_of_protein":"No_of_unique_VH_seq"}) #按列排序 df14 = df14.loc[:,["cluster","No_of_unique_VH_seq","No_of_VH_seq","VH_frequency","Most_abundant_CDR3","CDR3_length","Representative_CDR3"]] #标记new family df14["new_family"] = df14.Representative_CDR3.isna() x = list(df14.new_family) i = 0 while i < len(x): if x[i] == False: x[i] = "" i = i+1 else: x[i] = "new" i = i+1 df14.new_family = x #将一些总结性的数据放在表头 df15 = pd.read_csv(Round3+".count.txt",sep="\t") df15.iloc[2,1] = df3.cDNA_contig.sum() df15.iloc[3,1] = df11.cluster_id.count() df15.iloc[4,1] = df8.sequence_alignment_aa.count() #写入文件 with pd.ExcelWriter(Round3+".vs_Round2.xlsx") as writer: df7.to_excel(writer, sheet_name="all_of_contig",index=False) df8.to_excel(writer, sheet_name="table_of_unique_protein",index=False) df10.to_excel(writer, sheet_name="table_of_unique_clonotype",index=False) df11.to_excel(writer, sheet_name="table_of_unique_cluster",index=False) df14.to_excel(writer, sheet_name="table_of_analysis",index=False,startrow=8) df15.to_excel(writer,sheet_name="table_of_analysis",index=False,startrow=0)
讯享网
VHH高通量测序数据分析---版本3

相关推荐