diff --git a/scripts/Malavi Project.Rmd b/scripts/Malavi Project.Rmd new file mode 100644 index 0000000..00573f2 --- /dev/null +++ b/scripts/Malavi Project.Rmd @@ -0,0 +1,567 @@ +--- +title: "Malavi Mapping Group Project" +author: "CEID Disease Mapping Working Group" +date: "`r format(Sys.time(), '%d %B, %Y')`" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + + +```{r libLoad, message = F} +library(tidyverse) +library(magrittr) +library(sf) +library(measurements) +library(tmap) +library(spData) +library(here) +``` + +```{r getData} +#set correct path and uncomment next line to get ecoregions +ecoReg <- sf::st_read("Terrestrial_Ecoregions.shp") +q <- readr::read_tsv(here("MalAvi_9794263.csv")) +q <- subset(q, q$Host_Environment == "Wild") +ecoReg <- sf::st_transform(ecoReg, st_crs(world)) +``` + +```{r manip} +#reduce to records with lat/long +q %<>% drop_na(coordinates) + +#turn coords column to lat long +q %<>% mutate(lat = stringr::str_replace_all(coordinates,"\\,.*"," "), + lon = stringr::str_replace_all(coordinates,".*,","")) + +q %<>% mutate(lat = stringr::str_replace_all(lat," ",""), + lon = stringr::str_replace_all(lon," ","")) +q %<>% mutate(lat = stringr::str_replace_all(lat,"°"," "), + lon = stringr::str_replace_all(lon,"°"," ")) +q %<>% mutate(lat = stringr::str_replace_all(lat,"º"," "), + lon = stringr::str_replace_all(lon,"º"," ")) +q %<>% mutate(lat = stringr::str_replace_all(lat,"'",""), + lon = stringr::str_replace_all(lon,"'","")) +q %<>% mutate(lat = stringr::str_replace_all(lat,"`",""), + lon = stringr::str_replace_all(lon,"`","")) +q %<>% mutate(lat = stringr::str_replace_all(lat,"´",""), + lon = stringr::str_replace_all(lon,"´","")) +q %<>% mutate(lat = stringr::str_replace_all(lat,"′",""), + lon = stringr::str_replace_all(lon,"′","")) +q %<>% mutate(lat = stringr::str_replace_all(lat,"’",""), + lon = stringr::str_replace_all(lon,"’","")) + +q %<>% mutate(lat = measurements::conv_unit(lat,from='deg_dec_min',to='dec_deg'), + lon = measurements::conv_unit(lon,from='deg_dec_min',to='dec_deg')) + + +q %<>% mutate(lat=as.numeric(lat), + lon=as.numeric(lon)) +``` + + + +```{r make sf} +#q %<>% dplyr::filter(`#no`!=2642) #wrong coords +q %<>% dplyr::filter(lat>-300) #remove weird points with very low lat + +pnts_sf <- st_as_sf(q, coords = c('lon', 'lat'), crs = st_crs(world)) #make into sf object +sf_use_s2(F) #spherical geometry OFF; not sure why we need to do this +``` + +```{r basic map} +tm_shape(pnts_sf) + tm_dots() + tm_shape(world) + tm_borders() #simple map +``` + +```{r assign regions} +#make new sf with columns indicating which ecoregion each point is in +pnts <- pnts_sf %>% mutate( + intersection=as.integer(st_intersects(geometry, ecoReg)), + ecoRegion=if_else(is.na(intersection),'',ecoReg$ECO_NAME[intersection]), + mht=if_else(is.na(intersection),'',ecoReg$WWF_MHTNAM[intersection]), +) +``` + +```{r color points by ecoregion} +tm_shape(world) + tm_borders() + + tm_shape(pnts) + tm_dots(col = as.factor("ecoRegion"), size = 0.1, legend.show = F) + + tmap_options(max.categories = n_distinct(pnts$ecoRegion)) +``` + +```{r missing points} +#save map object of missing points for ecoregion +missing_pts_tm <- pnts %>% filter(ecoRegion == "") %>% tm_shape(.) + tm_dots() #NA's were previously labeled as "" (i.e. nothing) +tm_shape(world) + tm_borders() + missing_pts_tm +``` + +```{r color points by mht} +tm_shape(world) + tm_borders() + + tm_shape(pnts) + tm_dots(col = as.factor("mht"), size = 0.1, legend.show = F) + + tmap_options(max.categories = n_distinct(pnts$mht)) +``` + +```{r save RDS} +#save RDS of malavi points data with assigned ecoregions +#uncomment next line if you need to save this again +#saveRDS(pnts, file = here("data","processed_data","points.rds")) +``` + + + +```{r data-summary} +ep <- pnts %>% dplyr::select(ecoRegion, Lineage_Name) %>% distinct() +ep <- ep %>% + filter(ecoRegion != "") +ep <- as.data.frame(ep) +ep <- ep[c("ecoRegion", "Lineage_Name")] +unique_eco.regions <- length(unique(pnts$ecoRegion)) #247 +unique_lineages <- length(unique(ep$Lineage_Name)) #3011 +unique_bird.sp <- length(unique(pnts$species[pnts$ecoRegion != ""])) #1552 +unique_bird.fam <- length(unique(pnts$family[pnts$ecoRegion != ""])) #112 +unique_bird.ord <- length(unique(pnts$order[pnts$ecoRegion != ""])) #26 + +Bird.sp <- unique(pnts$species[pnts$ecoRegion != ""]) +str(Bird.sp) +write.csv(Bird.sp, "Bird.sp.csv") +``` + + + +```{r Parasite Jaccard Index between Ecoregions (Pairwise)} +### Jaccard index per Ecoregion (pairwise comparison) ### +library(dplyr) + +# Jaccard function +jaccard <- function(a, b) { + intersection = length(intersect(a,b)) + union = length(a) + length(b) - intersection + return (intersection/union) +} + +myecoR <- ep %>% dplyr::select(ecoRegion) %>% distinct() %>% pull() +myecoR <- as.data.frame(myecoR) + +myJaccard <- tibble(ecoregion1=character(0),ecoregion2=character(0),jaccard=numeric(0)) + +for (i in 1:length(myecoR$myecoR)){ + for (j in i:length(myecoR$myecoR)){ + if (i!=j){ + ecoRA <- myecoR$myecoR[i] + ecoRB <- myecoR$myecoR[j] + ep_A <- ep %>% dplyr::filter(ecoRegion == ecoRA) %>% dplyr::select(Lineage_Name) %>% pull() + ep_B <- ep %>% dplyr::filter(ecoRegion == ecoRB) %>% dplyr::select(Lineage_Name) %>% pull() + jaccardValue <- jaccard(ep_A,ep_B) + myJaccard %<>% add_case(ecoregion1=ecoRA,ecoregion2=ecoRB,jaccard=jaccardValue) + } + } +} + +# Saving malaria lineages Jaccard index +write_csv(myJaccard, "Malavi.Parasite.Jaccard.csv") +``` + +```{r Parasite cJaccard Index between Ecoregions (Pairwise)} +# Corrected Jaccard function +cJaccard <- function(a, b) { + intersection <- length(intersect(a, b)) + possible <- min(length(a), length(b)) + return (intersection / possible) +} + +# Extracting distinct ecoregions +myecoR <- ep %>% dplyr::select(ecoRegion) %>% distinct() %>% pull() +myecoR <- as.data.frame(myecoR) + +# Empty tibble for storing results +myCorrectedJaccard <- tibble(ecoregion1 = character(0), + ecoregion2 = character(0), + cJaccard = numeric(0)) + +# Loop through the ecoregions to get the corrected Jaccard index +for (i in 1:length(myecoR$myecoR)) { + for (j in i:length(myecoR$myecoR)) { + if (i != j) { + ecoRA <- myecoR$myecoR[i] + ecoRB <- myecoR$myecoR[j] + + # Get the lineagees for each ecoregion + ep_A <- ep %>% dplyr::filter(ecoRegion == ecoRA) %>% dplyr::select(Lineage_Name) %>% pull() + ep_B <- ep %>% dplyr::filter(ecoRegion == ecoRB) %>% dplyr::select(Lineage_Name) %>% pull() + + # Calculate the corrected Jaccard index + cJaccardValue <- cJaccard(ep_A, ep_B) + + myCorrectedJaccard %<>% add_case(ecoregion1 = ecoRA, + ecoregion2 = ecoRB, + cJaccard = cJaccardValue) + } + } +} + +write.csv(myCorrectedJaccard, "Malavi.parasite.cjaccard.csv") +``` + + +```{r Interection Birds-Ecoregions} +# Here I'm assigning birds with ecoregions +# Packages needed +library(tidyverse) +library(magrittr) +library(sf) +library(rgdal) + +# Downloading data +# Ecoregion data +ecoReg <- sf::st_read("Terrestrial_Ecoregions.shp") + +# Bird distribution data (subset with our species of interest from Birdlife) +birdDis <- sf::st_read(dsn = "Bird_Dist_Malavi 1.gpkg") + +ecoReg <- sf::st_make_valid(ecoReg) +sf::sf_use_s2(FALSE) + + +uniqueBirds <- unique(birdDis$sci_name) +ecoReg <- st_transform(ecoReg, crs = st_crs(birdDis)) + +ecoReg <- ecoReg %>% + filter(ECO_NAME %in% pnts$ecoRegion) +ecoregion_names <- unique(ecoReg$ECO_NAME) + +# Empty matrix to store our results +result_matrix <- matrix(0, nrow = length(ecoregion_names), ncol = length(uniqueBirds), + dimnames = list(ecoregion_names, uniqueBirds)) + + +# Loop through each combination of species and ecoregions +for (i in length(uniqueBirds)) + species <- uniqueBirds[i] + species_polygon <- birdDis[birdDis$sci_name == species, ] + + for (j in 1:length(ecoregion_names)) { + if (!j %in% c(473,475)){ + ecoregion <- ecoregion_names[j] + ecoregion_polygon <- ecoReg[ecoReg$ECO_NAME == ecoregion, ] + + # Check for intersection + if (nrow(st_intersection(species_polygon, ecoregion_polygon)) > 0) { + result_matrix[j, i] <- 1 + } + } + } + +# Just checking +idx1 <- which(result_matrix[,1]==1) +idx2 <- which(result_matrix[,2]==1) +idx4 <- which(result_matrix[,4]==1) + +plot(ecoReg$geometry[idx1]) + +# write.csv(result_matrix, "result_matrix.csv") +result_matrix <- read.csv("result_matrix.csv") + +# Print the result matrix +print(result_matrix) +``` + + +```{r Bird (all) Jaccard Index per Ecoregion (Pairwise)} +# Jaccard function: +jaccard <- function(a, b) { + intersection = sum(a & b) + union = sum(a | b) + return(intersection / union) +} + +# Extracting ecoregion names +ecoregions <- result_matrix$X + +# Selecting only the columns with species data +result_a_matrix <- result_matrix[, -1] + +# Creating an empty dataframe to store our results +jaccard_df <- data.frame(ecoregion_A = character(), + ecoregion_B = character(), + jaccard_index = numeric(), + stringsAsFactors = FALSE) + +# Calculating Jaccard index for pairwise ecoregion comparisons (of all bird species) +for (i in 1:(nrow(result_a_matrix) - 1)) { + for (j in (i + 1):nrow(result_a_matrix)) { + ecoregion_A <- ecoregions[i] + ecoregion_B <- ecoregions[j] + species_A <- result_a_matrix[i, ] + species_B <- result_a_matrix[j, ] + jaccard_value <- jaccard(species_A, species_B) + # Adding the result to the dataframe + jaccard_df <- rbind(jaccard_df, data.frame(ecoregion_A = ecoregion_A, + ecoregion_B = ecoregion_B, + jaccard_index = jaccard_value)) + } +} + +write.csv(jaccard_df, "Malavi.bird.jaccard.csv") +``` + + +```{r Merging Jaccard Indexes per Ecoregion (Pairwise)} +# Renaming columns in myJaccard to match the column names in results_df +myJaccard <- myJaccard %>% + rename(Ecoregion1 = ecoregion1, Ecoregion2 = ecoregion2) + +# Merging dataframes using dplyr's left_join +merged_df <- left_join(result_df, myJaccard, by = c("Ecoregion1", "Ecoregion2")) + +merged_df <- merged_df[!duplicated(t(apply(merged_df[c("Ecoregion1", "Ecoregion2")], 1, sort))), ] + +merged_df <- merged_df %>% + rename(Jaccard.Birds = JaccardIndex, Jaccard.Malaria = jaccard) + +# Print the resulting data frame +print(merged_df) +``` + + +```{r Classifying migratory vs resident birds} +# Here Im assigning each bird species to one of the 2 categories: Resident or Migratory +# For this I'm using the metadata of the polygons (distribution) + +# Downloading the data +polygon.data <- read.csv("Bird.Polygons.Malavi.csv") + +# Remove rows with season value of 5 (metadata) +polygon.data <- polygon.data[polygon.data$seasonal != 5, ] + +# Function to classify species. +classify_species <- function(polygon.data) { + result <- aggregate(seasonal ~ sci_name, polygon.data, function(x) { + if (all(x == 1)) { + "resident" + } else { + "migratory" + } + }) + return(result) +} + +# Applying the classification +species_classification <- classify_species(polygon.data) +print(species_classification) +``` + + +```{r Bird (migratory) Jaccard Index per Ecoregion (Pairwise)} +# Replace dots with spaces in column names to make them match +colnames(result_matrix) <- gsub(".", " ", colnames(result_matrix), fixed = TRUE) + +# Filtering the dataframe to include only Migratory species +migratory_species <- species_classification[species_classification$seasonal == "migratory", "sci_name"] + +# Subset the dataframe based on migratory species +result_matrix_m <- result_matrix[, c("X", names(result_matrix)[names(result_matrix) %in% migratory_species])] + +# Extracting ecoregion names +ecoregions <- result_matrix_m$X + +# Selecting only the columns with species data +result_m_matrix <- result_matrix_m[, -1] + +# Creating an empty dataframe to store the results +jaccard_m_df <- data.frame(ecoregion_A = character(), + ecoregion_B = character(), + jaccard_index = numeric(), + stringsAsFactors = FALSE) + +# Calculating Jaccard index for pairwise ecoregions +for (i in 1:(nrow(result_m_matrix) - 1)) { + for (j in (i + 1):nrow(result_m_matrix)) { + ecoregion_A <- ecoregions[i] + ecoregion_B <- ecoregions[j] + species_A <- result_m_matrix[i, ] + species_B <- result_m_matrix[j, ] + jaccard_value <- jaccard(species_A, species_B) + # Adding the result to the dataframe + jaccard_m_df <- rbind(jaccard_m_df, data.frame(ecoregion_A = ecoregion_A, + ecoregion_B = ecoregion_B, + jaccard_index = jaccard_value)) + } +} + +# Saving the results +write.csv(jaccard_m_df, "Malavi.M.bird.jaccard.csv") +``` + + +```{r Bird (resident) Jaccard Index per Ecoregion (Pairwise)} +# Filter the dataframe to include only resident species +# Replace dots with spaces in column names +colnames(result_matrix) <- gsub(".", " ", colnames(result_matrix), fixed = TRUE) + +resident_species <- species_classification[species_classification$seasonal == "resident", "sci_name"] + +# Subset the dataframe based on resident species +result_matrix_r <- result_matrix[, c("X", names(result_matrix)[names(result_matrix) %in% resident_species])] + +# Extracting ecoregion names +ecoregions <- result_matrix_r$X + +# Selecting only the columns with species data +result_r_matrix <- result_matrix_r[, -1] + +# Creating an empty dataframe to store results +jaccard_r_df <- data.frame(ecoregion_A = character(), + ecoregion_B = character(), + jaccard_index = numeric(), + stringsAsFactors = FALSE) + +# Calculating Jaccard index for pairwise ecoregions +for (i in 1:(nrow(result_r_matrix) - 1)) { + for (j in (i + 1):nrow(result_r_matrix)) { + ecoregion_A <- ecoregions[i] + ecoregion_B <- ecoregions[j] + species_A <- result_r_matrix[i, ] + species_B <- result_r_matrix[j, ] + jaccard_value <- jaccard(species_A, species_B) + # Adding the result to the dataframe + jaccard_r_df <- rbind(jaccard_r_df, data.frame(ecoregion_A = ecoregion_A, + ecoregion_B = ecoregion_B, + jaccard_index = jaccard_value)) + } +} + +# Saving our results +write.csv(jaccard_r_df, "Malavi.R.bird.jaccard.csv") +``` + +```{r Bird (all) Corrected Jaccard Index per Ecoregion (Pairwise)} +# Estimating the corrected jaccard index of all bird species +# Corrected Jaccard function +cJaccard <- function(a, b){ + intersection <- sum(a & b) + possible <- min(sum(a), sum(b)) + return(intersection / possible) +} + +# Extracting ecoregion names +ecoregions <- result_matrix$X + +# Selecting only the columns with species data +result_a_matrix <- result_matrix[, -1] + +# Creating an empty dataframe to store results +cJaccard_df <- data.frame(ecoregion_A = character(), + ecoregion_B = character(), + cJaccard_index = numeric(), + stringsAsFactors = FALSE) + +# Calculating Corrected Jaccard index for pairwise ecoregion comparisons +for (i in 1:(nrow(result_a_matrix) - 1)) { + for (j in (i + 1):nrow(result_a_matrix)) { + ecoregion_A <- ecoregions[i] + ecoregion_B <- ecoregions[j] + species_A <- result_a_matrix[i, ] + species_B <- result_a_matrix[j, ] + cJaccard_value <- cJaccard(species_A, species_B) + # Adding the result to the dataframe + cJaccard_df <- rbind(cJaccard_df, data.frame(ecoregion_A = ecoregion_A, + ecoregion_B = ecoregion_B, + cJaccard_index = cJaccard_value)) + } +} + +print(cJaccard_df) + +# Saving the results +write.csv(cJaccard_df, "Malavi.bird.cjaccard.csv") +``` + + +```{r Bird (migratory) Corrected Jaccard Index per Ecoregion (Pairwise)} +# Estimating the corrected jaccard index of migratory species +# Subset the dataframe based on migratory species +result_matrix_m <- result_matrix[, c("X", names(result_matrix)[names(result_matrix) %in% migratory_species])] + +# Extracting ecoregion names +ecoregions <- result_matrix_m$X + +# Selecting only the columns with species data +result_m_matrix <- result_matrix_m[, -1] + +# Creating an empty dataframe to store results +cJaccard_m_df <- data.frame(ecoregion_A = character(), + ecoregion_B = character(), + cJaccard_index = numeric(), + stringsAsFactors = FALSE) + +# Corrected Jaccard function (again just in case - we can skip this) +cJaccard <- function(a, b){ + intersection <- sum(a & b) + possible <- min(sum(a), sum(b)) + return(intersection / possible) +} + +# Calculating Corrected Jaccard index for pairwise ecoregion comparisons (migratory) +for (i in 1:(nrow(result_m_matrix) - 1)) { + for (j in (i + 1):nrow(result_m_matrix)) { + ecoregion_A <- ecoregions[i] + ecoregion_B <- ecoregions[j] + species_A <- result_m_matrix[i, ] + species_B <- result_m_matrix[j, ] + cJaccard_value <- cJaccard(species_A, species_B) + # Adding the result to the dataframe + cJaccard_m_df <- rbind(cJaccard_m_df, data.frame(ecoregion_A = ecoregion_A, + ecoregion_B = ecoregion_B, + cJaccard_index = cJaccard_value)) + } +} + +# Writing the corrected Jaccard index dataframe to a CSV file +write.csv(cJaccard_m_df, "Malavi.M.bird.cJaccard.csv") +2 +``` + +```{r Bird (resident) Corrected Jaccard Index per Ecoregion (Pairwise)} +# Estimating the corrected jaccard index of resident species +# Subset the dataframe based on resident species +result_matrix_r <- result_matrix[, c("X", names(result_matrix)[names(result_matrix) %in% resident_species])] + +# Extracting ecoregion names +ecoregions <- result_matrix_r$X + +# Selecting only the columns with species data +result_r_matrix <- result_matrix_r[, -1] + +# Creating an empty dataframe to store results +cJaccard_r_df <- data.frame(ecoregion_A = character(), + ecoregion_B = character(), + cJaccard_index = numeric(), + stringsAsFactors = FALSE) + +# Corrected Jaccard function +cJaccard <- function(a, b){ + intersection <- sum(a & b) + possible <- min(sum(a), sum(b)) + return(intersection / possible) +} + +# Calculating Corrected Jaccard index for pairwise ecoregion comparisons (residents) +for (i in 1:(nrow(result_r_matrix) - 1)) { + for (j in (i + 1):nrow(result_r_matrix)) { + ecoregion_A <- ecoregions[i] + ecoregion_B <- ecoregions[j] + species_A <- result_r_matrix[i, ] + species_B <- result_r_matrix[j, ] + cJaccard_value <- cJaccard(species_A, species_B) + # Adding the result to the dataframe + cJaccard_r_df <- rbind(cJaccard_r_df, data.frame(ecoregion_A = ecoregion_A, + ecoregion_B = ecoregion_B, + cJaccard_index = cJaccard_value)) + } +} + +# Writing the corrected Jaccard index dataframe to a CSV file +write.csv(cJaccard_r_df, "Malavi.R.bird.cJaccard.csv") +``` +