-
Notifications
You must be signed in to change notification settings - Fork 0
/
GBIF_transform.r
111 lines (102 loc) · 4.64 KB
/
GBIF_transform.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# GBIF taxonomy conversion ----
# Read both spreadsheets up front: the GBIF export and the TPT Darwin Core
# (DwC) template that supplies the full set of DwC column names.
GBIF <- read_excel("~/GitHub/tpt-acari/input/GBIF_acari.xlsx")
# NOTE(review): this path is relative to the working directory, while every
# other path in the script is absolute - confirm the script is always run
# from the repository root.
tpt_dwc_template <- read_excel("input/tpt_dwc_template.xlsx")

# Ticks (family Ixodidae) are split off; %in% treats a missing family as
# "not a tick", which matches dropping NA comparisons with which().
is_tick <- GBIF$family %in% "Ixodidae"
Ticks <- GBIF[which(is_tick), ]

# Working table: every record whose taxonID does not belong to a tick.
non_tick_rows <- which(GBIF$taxonID %!in% Ticks$taxonID)
df <- GBIF[non_tick_rows, ]

# Untouched copy of the tick-free input, used by the row-count sanity check.
GBIF_origin <- df

# Stack the template under the data so that df gains every DwC column;
# columns missing on either side are filled with NA.
df <- rbindlist(list(df, tpt_dwc_template), fill = TRUE)
# Ensure no NA in taxonomicStatus ----
# Records without a taxonomic status are labelled "doubtful". This is one
# vectorised assignment instead of a row-by-row loop: for (i in 1:nrow(df))
# iterates over c(1, 0) when df has no rows and then fails, and it also
# copies the column once per row.
status_missing <- which(is.na(df$taxonomicStatus))
if (length(status_missing) > 0) {
  df$taxonomicStatus[status_missing] <- "doubtful"
}

# Add a dataset for accepted taxa without one ----
# Accepted records that lack a datasetID are assigned to "TPT". The guard
# leaves the column (and its type) untouched when nothing needs filling.
needs_dataset <- which(is.na(df$datasetID) & df$taxonomicStatus == "accepted")
if (length(needs_dataset) > 0) {
  df$datasetID[needs_dataset] <- "TPT"
}
# Get accepted names ----
# Rows that carry an acceptedNameUsageID refer to another record. They are
# moved into temp so that the lookup below searches only the remaining rows.
temp <- df[which(!is.na(df$acceptedNameUsageID)), ]
df <- df[which(is.na(df$acceptedNameUsageID)), ]

# seq_len() rather than 1:nrow(temp): when no row has an accepted name ID,
# 1:0 would still run the body for i = 1 and i = 0 and fail on the empty table.
# NOTE(review): vlookup() is defined outside this file; presumably it returns
# the df$canonicalName whose df$taxonID equals the given ID - verify.
for (i in seq_len(nrow(temp))) {
  temp$acceptedNameUsage[i] <- vlookup(
    df$canonicalName,
    temp$acceptedNameUsageID[i],
    df$taxonID
  )
}

# Put the rows with their accepted names back into the working table.
df <- rbind(df, temp)
# Set aside taxa that have no dataset ID ----
# The mask is computed once and used for both halves of the split.
lacks_dataset <- is.na(df$datasetID)
df_nodataset <- df[which(lacks_dataset), ]
df_nodataset$reason <- "no dataset"
df <- df[which(!lacks_dataset), ]

# Set aside taxa whose taxonomicStatus is "misapplied" ----
# TODO: how misapplied names should be treated is still undecided, so they
# are removed from the working table and kept for review.
misapplied_rows <- which(df$taxonomicStatus == "misapplied")
df_misapplied <- df[misapplied_rows, ]
df_misapplied$reason <- "misapplied"
kept_rows <- which(df$taxonID %!in% df_misapplied$taxonID)
df <- df[kept_rows, ]
# Check for duplicate names ----
# A name counts as duplicated if it occurs more than once anywhere in the
# table; scanning from both ends marks every occurrence, not just the repeats.
name_repeats <- duplicated(df$scientificName) |
  duplicated(df$scientificName, fromLast = TRUE)
df$reason <- c(ifelse(name_repeats, "duplicate", NA))

# Split the working table into flagged rows (for review) and clean rows.
flagged_dupe <- grepl("duplicate", df$reason)
df_dupes_review <- df[which(flagged_dupe), ]
df <- df[which(!flagged_dupe), ]

# Among the flagged rows, keep those whose Dataset is "NEW", one row per
# canonicalName (the first occurrence wins).
new_dupes <- df_dupes_review[which(df_dupes_review$Dataset == "NEW"), ]
df_dupes_keep <- new_dupes[which(!duplicated(new_dupes$canonicalName)), ]

# Every other flagged row is treated as removed.
discarded_rows <- which(df_dupes_review$taxonID %!in% df_dupes_keep$taxonID)
df_dupes <- df_dupes_review[discarded_rows, ]

# Return the kept duplicates to the working table and gather everything that
# was removed (duplicates, misapplied names, rows with no dataset).
df <- rbind(df, df_dupes_keep, fill = TRUE)
df_removed <- rbindlist(list(df_dupes, df_misapplied, df_nodataset), fill = TRUE)

# Tag each remaining record with its source.
df$source <- "GBIF"
# Run this part only after the final review ----
# Save the columns that are not part of Darwin Core, together with source and
# taxonID so the rows can be traced back to the working table.
df_non_dwc <- df[, c("source", "taxonID", "Dataset", "datasetID", "genericName", "reason")]

# Drop the non-DwC columns from the working table.
df$reason <- NULL
df$genericName <- NULL
df$datasetID <- NULL
df$Dataset <- NULL

# Add an empty subfamily column for consistency.
df$subfamily <- NA
# Put the columns in their final order ----
# The empty slot before the comma keeps every row; the character vector after
# it names the columns to keep, in the order they will be written out.
df <- df[, c(
  # source and identifiers
  "source", "taxonID", "scientificNameID", "acceptedNameUsageID",
  "parentNameUsageID", "originalNameUsageID", "nameAccordingToID",
  "namePublishedInID", "taxonConceptID",
  # names and publication
  "scientificName", "acceptedNameUsage", "parentNameUsage",
  "originalNameUsage", "nameAccordingTo", "namePublishedIn",
  "namePublishedInYear",
  # classification
  "higherClassification", "kingdom", "phylum", "class", "order", "family",
  "subfamily", "genus", "subgenus", "specificEpithet", "infraspecificEpithet",
  # rank, authorship and status
  "taxonRank", "verbatimTaxonRank", "scientificNameAuthorship",
  "vernacularName", "nomenclaturalCode", "taxonomicStatus",
  "nomenclaturalStatus", "taxonRemarks", "canonicalName"
)]
# Sanity check ----
# Every input row must end up either in the transformed table or in the
# removed-rows table; the outputs are written only when the counts agree.
original <- nrow(GBIF_origin) # rows in the tick-free input
final <- nrow(df) + nrow(df_removed) # transformed rows plus removed rows
if (original != final) {
  print("rows are missing")
} else {
  # transformed GBIF records in Darwin Core layout
  write.csv(df, "~/GitHub/tpt-acari/output/GBIF_DwC.csv", row.names = FALSE)
  # rows removed as duplicates, misapplied names or lacking a dataset
  write.csv(df_removed, "~/GitHub/tpt-acari/output/GBIF_removed.csv", row.names = FALSE)
  # non-Darwin Core columns saved before they were dropped
  write.csv(df_non_dwc, "~/GitHub/tpt-acari/output/GBIF_non_DwC.csv", row.names = FALSE)
  print("YAY")
}