-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathR_name2taxid_GMPD.R
68 lines (58 loc) · 1.95 KB
/
R_name2taxid_GMPD.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Load the GMPD tables used by this script.
# All paths are relative to the current working directory; "../" goes up ONE
# level, so the script must be run from a directory that sits beside DATA/.

# Quick visibility check on the main table before trying to read it.
print(file.exists("../DATA/GMPD_main_2017-02-06.csv"))
G <- read.csv("../DATA/GMPD_main_2017-02-06.csv")

# The GMPD parasite taxonomy table is read because its ParType column is
# attached to the records further down in this script.
G_taxonomy <- read.csv("../DATA/GMPD_parasite_taxonomy_2016-02-06.csv")
# Keep only taxonomy rows flagged as having a binomial name.
G_taxonomy <- subset(G_taxonomy, HasBinomialName == "yes")

G_traits <- read.csv("../DATA/GMPD_parasite_traits_2016-12-01.csv")

# Size of the main table (rows, columns).
# NOTE: a plain merge(G, G_traits) was tried here and returned fewer rows,
# because G_traits is not filled in for every parasite; the script therefore
# combines the tables by a different method further down instead of merging
# G directly.
dim(G)
# Split the GMPD records by whether their parasite is listed in the traits
# table, add trait columns to each half, and stack the halves back together.

# TRUE for records whose parasite name occurs in G_traits.
has_traits <- G$ParasiteCorrectedName %in% G_traits$ParasiteCorrectedName

# Only the host name, host order and parasite name are carried forward.
G <- G[, c("HostCorrectedName", "HostOrder", "ParasiteCorrectedName")]

# Records with a traits entry pick up their extra columns through merge().
G_to_merge <- merge(G[has_traits, ], G_traits)

# Records without a traits entry get the same-named columns filled with NA,
# so that rbind() can match the two halves column by column.
G_not <- G[!has_traits, ]
for (trait_col in c("close", "nonclose", "vector", "intermediate",
                    "ParasiteTraitsCitation")) {
  G_not[[trait_col]] <- NA
}

# Merged records first, padded records after.
G_combined <- rbind(G_to_merge, G_not)
dim(G_combined)
# Attach each parasite's ParType from the taxonomy table to G_combined and
# drop records whose parasite has no taxonomy entry.
#
# Done with a vectorised lookup rather than a per-name loop that grows the
# result with rbind(): it does not slow down quadratically, and it yields a
# zero-row data frame (not NULL) when nothing matches or the input is empty.
G_taxonomy <- G_taxonomy[, c("ParType", "ParasiteCorrectedName")]

parasite <- G_combined$ParasiteCorrectedName

# match() gives the FIRST taxonomy row for each name, so a parasite listed
# more than once in the taxonomy table takes the ParType of its first entry.
tax_row <- match(parasite, G_taxonomy$ParasiteCorrectedName)
G_combined$ParType <- G_taxonomy$ParType[tax_row]

# Keep only records with a taxonomy entry. Missing names are excluded
# explicitly because match() would otherwise pair NA with NA.
keep <- !is.na(parasite) & !is.na(tax_row)

# Group the rows by parasite, in order of each parasite's first appearance;
# order() leaves ties in their existing order, so rows of the same parasite
# stay in their current relative order.
by_parasite <- order(match(parasite, unique(parasite)))
G_combined <- G_combined[by_parasite[keep[by_parasite]], , drop = FALSE]
dim(G_combined)
# Keep bacterial parasites only, look up a taxonomy id for each pathogen name,
# and save the result as G.

# get only bacteria
df <- subset(G_combined, ParType == "Bacteria")

# Rename ParasiteCorrectedName -> pathogen with base R, so this step does not
# depend on a package that this script never attaches.
names(df)[names(df) == "ParasiteCorrectedName"] <- "pathogen"

# Drop records with an empty pathogen name.
df <- subset(df, pathogen != "")

# Look up each DISTINCT pathogen name once and keep the first tax_id returned.
# NOTE(review): name2taxid() is not defined or attached anywhere in this
# script; presumably it is taxizedb::name2taxid -- confirm, and attach the
# package before running. This also assumes that repeated lookups of the same
# name return the same result.
pathogens <- unique(df$pathogen)
first_taxid <- rep(NA, length(pathogens))
for (i in seq_along(pathogens)) {  # seq_along() is safe when nothing is left
  hits <- name2taxid(pathogens[i], out_type = "summary")
  first_taxid[i] <- hits$tax_id[1]  # NA when the lookup returns no rows
}

# Spread the per-name ids back onto every record.
df$tax_id <- first_taxid[match(df$pathogen, pathogens)]

G <- df
print(nrow(G))
save(G, file = "../DATA/PROCESSED/G.Rdata")