--- title: "Measles Virus B3" subtitle: "Phylogenetic Topology in Final Presentation with Clade Relationships, Operation Allies Welcome" author: "Andrew Beck, NCIRD/DVD/VVPDB" date: "May 21, 2021" output: html_document: toc: true toc_float: true toc_depth: 5 number_sections: true params: TREEFILE_WGS: # Path to Maximum Likelihood tree BEASTFILE_WGS: # Path to WGS Bayesian tree BEASTFILE_N450: # Path to N450 Bayesian tree METADATA_TABLE: # Path to metadata table SUBSTITUTIONS_TABLE: # Path to substitutions table WGS_ALIGNMENT: # Path to WGS sequence alignment N450_ALIGNMENT: # Path to N450 Bayesian tree --- ```{r setup, include=FALSE, echo = FALSE, warning=FALSE, message=FALSE} library(ggtree) library(ggsci) library(pastecs) library(tidyverse) library(tidytree) library(ggplot2) library(ggpattern) library(knitr) library(phytools) library(RColorBrewer) library(kableExtra) library(treeio) library(scales) library(plotly) library(aplot) library(purrr) library(Biostrings) library(DiagrammeR) library(lubridate) library(tidyquant) library(stringr) library(tracerer) library(ggrepel) library(cowplot) library(ggprism) library(gtable) library(grid) library(ape) library(ggthemes) library(magick) library(magrittr) library(dplyr) knitr::opts_chunk$set(echo = FALSE, warning = FALSE, global.par = TRUE, message = FALSE, cache = FALSE, root.dir =' C:/Users/lpu5/R_Projects/PHYLOGENETICS_MF/12032020_NY_WGS_FINAL') ``` ```{r margins, include=FALSE, echo = FALSE} par(mar=c(0,0,0,0)) ``` ```{r adaptive_scale_process, echo=FALSE, cache=FALSE} # This will handle the interpolated scales. Courtesy https://nanx.me/blog/post/ggplot2-color-interpolation/ # Courtesy https://jianfengwu2022.github.io/ # @param values Color values. pal_ramp <- function(values) { force(values) function(n) { if (n <= length(values)) { values[seq_len(n)] } else { colorRampPalette(values, alpha = TRUE)(n) } } } # Adaptive color palette generator # # Adaptive color palette generator for ggsci color palettes using `pal_ramp()`. # # @param name Color palette name in ggsci # @param palette Color palette type in ggsci # @param alpha Transparency level, a real number in (0, 1]. # # @details See `names(ggsci:::ggsci_db)` for all color palette names in ggsci. # See `names(ggsci:::ggsci_db$"pal")` for available palette types under # the palette `pal`. pal_adaptive <- function(name, palette, alpha = 1) { if (alpha > 1L | alpha <= 0L) stop("alpha must be in (0, 1]") raw_cols <- ggsci:::ggsci_db[[name]][[palette]] raw_cols_rgb <- col2rgb(raw_cols) alpha_cols <- rgb( raw_cols_rgb[1L, ], raw_cols_rgb[2L, ], raw_cols_rgb[3L, ], alpha = alpha * 255L, names = names(raw_cols), maxColorValue = 255L ) pal_ramp(unname(alpha_cols)) } #Adaptive color scales # # @inheritParams pal_adaptive # @param ... additional parameters for [ggplot2::discrete_scale()]. scale_color_adaptive <- function(name, palette, alpha = 1, ...) { ggplot2::discrete_scale("colour", name, pal_adaptive(name, palette, alpha), ...) } scale_fill_adaptive <- function(name, palette, alpha = 1, ...) { ggplot2::discrete_scale("fill", name, pal_adaptive(name, palette, alpha), ...) } ``` ```{r metadata_ingest, echo=FALSE, cache=FALSE} tipcategories = read.table(params$METADATA_TABLE, sep = "\t", #col.names = c("TAXON","IS_OAW","WHO_NAME","COUNTRY","STATE", "EPI_WEEK","YEAR"), header = TRUE, comment.char = "", stringsAsFactors = FALSE) ``` ```{r tree_ingest, echo=FALSE, warning=FALSE} # Ingest the maximum likelihood tree, WGS TREE_WGS_ML <- read.iqtree(file = params$TREEFILE_WGS) TREE_WGS_ML <- root(as.phylo(TREE_WGS_ML), outgroup = "MVs_Manchester.GBR_7.12_2_B3", resolve.root=TRUE) TREE_WGS_ML <- drop.tip(TREE_WGS_ML, "MVs_NewJersey.USA_8.19") TREE_WGS_ML <- ggtree(TREE_WGS_ML) %<+% tipcategories # Ingest the BEAST tree, WGS TREE_BEAST_WGS <- read.beast(file = params$BEASTFILE_WGS) TREE_BEAST_WGS <- ggtree(TREE_BEAST_WGS, mrsd="2021-10-07") %<+% tipcategories # Ingest the BEAST tree, N450 TREE_BEAST_N450 <- read.beast(file = params$BEASTFILE_N450) TREE_BEAST_N450 <- ggtree(TREE_BEAST_N450, mrsd="2021-10-07") %<+% tipcategories # Extract time of internal nodes from N450 BEAST tree, display selected node numbers with dates. my_tibble_N450 <-TREE_BEAST_N450 %>% as.treedata() %>% as.tibble %>% mutate(height_corrected = decimal_date(ymd("2021-10-07")) - height) %>% mutate(height_95_corrected_first = map_dbl(height_0.95_HPD, first)) %>% mutate(height_95_corrected_first = decimal_date(ymd("2021-10-07")) - height_95_corrected_first) %>% mutate(height_95_corrected_last = map_dbl(height_0.95_HPD, last)) %>% mutate(height_95_corrected_last = decimal_date(ymd("2021-10-07")) - height_95_corrected_last) %>% filter(node %in% c(17,167,168)) %>% select(node,height_corrected,height_95_corrected_first,height_95_corrected_last) kable(my_tibble_N450) # Extract time of internal nodes from WGS BEAST tree, display selected node numbers with dates. my_tibble_WGS <-TREE_BEAST_WGS %>% as.treedata() %>% as.tibble %>% mutate(height_corrected = decimal_date(ymd("2021-10-07")) - height) %>% mutate(height_95_corrected_first = map_dbl(height_0.95_HPD, first)) %>% mutate(height_95_corrected_first = decimal_date(ymd("2021-10-07")) - height_95_corrected_first) %>% mutate(height_95_corrected_last = map_dbl(height_0.95_HPD, last)) %>% mutate(height_95_corrected_last = decimal_date(ymd("2021-10-07")) - height_95_corrected_last) %>% filter(node %in% c(15,145,146)) %>% select(node,height_corrected,height_95_corrected_first,height_95_corrected_last) kable(my_tibble_WGS) ``` ## Metadata ### Purpose Measles virus WGS was performed on 44 specimens from OAW evacuee cases. 41 were successfully sequenced, and these are analyzed phylogenetically alongside several others from previous B3 outbreaks. 119 in total. ### Metadata Characteristics and Validation Number of samples in metadata table: **`r nrow(tipcategories)`** Maximum year in the metadata table: **`r max(tipcategories$YEAR)`** Maximum decimal year in the metadata table, branch reference: **`r decimal_date(ymd("2021-10-07"))`** Minimum year in the metadata table: **`r min(tipcategories$YEAR)`** ```{r TABLE_METADATA_SOURCE} kable(tipcategories, align = "c", caption = "All metadata attached to trees displayed below.")%>% kable_styling("hover", full_width = F, fixed_thead = T) %>% row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>% scroll_box(width = "100%", height = "300px") ``` ### Maximum Likelihood Trees, not used in study ## Tree used for stub-out of annotations, only. ```{r ML_WGS, fig.height=25,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} # Feed it a ggtree with joined data, returns a list of the tree labels (meant to follow a filter) TREE_WGS_ML + # If it's from the OAW study, label red geom_tiplab(aes(label=WHO_NAME, subset = ISOAW == "TRUE"), color = "red", size=4.5) + # Everything else is black geom_tiplab(aes(label=WHO_NAME, subset = ISOAW != "TRUE"), color = "black", size=4.5)+ geom_nodelab(aes(label=node), color="black")+ xlim(0, .03) ``` ## Whole B3 Tree with clade labels ```{r ML_WGS_whole, fig.height=25,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} # Feed it a ggtree with joined data, returns a list of the tree labels (meant to follow a filter) p <-TREE_WGS_ML + # If it's from the OAW study, label red geom_tiplab(aes(label=WHO_NAME, subset = ISOAW == "TRUE"), color = "red", size=4.5) + # Everything else is black geom_tiplab(aes(label=WHO_NAME, subset = ISOAW != "TRUE"), color = "black", size=4.5)+ #Label reasonable clusters. geom_nodepoint(aes(subset=(as.numeric(label) >= 50)), size = 3, color = "black", fill = "grey", alpha = .5)+ geom_cladelab(node=122,label="OAW Entire",offset=.006,barsize=2,fontsize=8)+ geom_cladelab(node=152,label="Italy 2017",offset=.0035,barsize=2,fontsize=8)+ geom_cladelab(node=172,label="California 2019",offset=.0035,barsize=2,fontsize=8)+ geom_cladelab(node=145,label="A",offset=.0035,barsize=2,fontsize=8)+ geom_cladelab(node=126,label="B",offset=.0040,barsize=2,fontsize=8)+ geom_cladelab(node=125,label="C",offset=.0045,barsize=2,fontsize=8)+ geom_cladelab(node=124,label="D",offset=.005,barsize=2,fontsize=8)+ xlim(0, .03) p ``` ## Partial Maximum Likelihood Clade This is identical to the "full" B3 tree, only the subtree is extracted for the current study sequences. ```{r ML_WGS_whole_clade, fig.height=10,fig.width=28, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} viewClade(p, 122) + geom_tiplab(aes(label=WHO_NAME), color = "black", size=4.5) ``` ## Bayesian Phylogeny WGS Tree was calculated in BEAST v.2.7. Chain was 4x50 million steps, with 10 percent burnin and 1:10000 sampling from chain (18000 trees in total sample). Tree is annotated as maximum clade credibility with mean branch lengths (time). 1. Models were compared and tested using IQtree v.2. Best support was TIM+G4+F (empirical frequencies) for concatenated coding regions, and TIM3+G4+F (empirical base frequencies) for concatenated noncoding regions. 2. Substitution Models are unlinked, meaning that the model controlling NCR and CDS is free to vary independently. 3. Clock Models are unlinked, NCR = strict, CDS = strict (linear). 4. Tree model is linked between partitions, Bayesian Skyline. ### Bayesian WGS with node numbers referenced Node numbers are assigned by the R package and are not scientifically meaningful. ```{r BEAST_Nodereference, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} p <- TREE_BEAST_WGS+ # If the sequence is from the OAW study, label as red. geom_tiplab(aes(label=WHO_NAME, subset = ISOAW == "TRUE"), color = "red", size=3) + # Everything else is black. geom_tiplab(aes(label=WHO_NAME, subset = ISOAW != "TRUE"), color = "black", size=3)+ theme_tree2() + scale_x_continuous( breaks = seq(2015,2022,1),labels = seq(2015,2022,1))+xlim_tree(2023) p ``` ### Bayesian WGS with posterior support 1. Bars show 95% HPD for the inferred **time in years** of the node. 2. Points show nodes with > 0.5 posterior support in the sample. ```{r BEAST_WHOLE, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} p <- TREE_BEAST_WGS+ # Everything else is black #geom_tiplab(aes(label=WHO_NAME), # color = "black", size=3)+ geom_hilight(node=145,fill = "pink", alpha = .5)+ geom_nodepoint(aes(subset=(as.numeric(posterior) >= .5)), color = "black", size = 5, fill="grey",alpha=0.5)+ theme_tree2() + xlim_tree(2022) p ``` ### Partial tree, with majority nodes shown if posterior > 0.9 This is a blow-up of the Bayesian subtree used for data presentation. ```{r BEAST_PARTIAL, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} p <- TREE_BEAST_WGS+ # Everything else is black geom_tiplab(aes(label=WHO_NAME),time_scale=TRUE, color = "black", size=5)+ geom_nodepoint(aes(subset=(as.numeric(posterior) >= .9)), color = "black", size = 5)+ theme_tree2() + scale_x_continuous( breaks = seq(2015,2022,1),labels = seq(2015,2022,1))+xlim_tree(2023) viewClade(p, 145) ``` ### Partial B3 OAW subtree tree, containing metadata annotations Notes: 1. This is a **partial** tree, graphically extracted from the larger B3 set that was modeled alongside. 2. Red arrows are highly confident transmission pairs, grey are less confident or hypothesized. 3. Some of the less confident transmissions (35-24, 35-47) cover considerable evolutionary distance (sum up all connecting branch lengths in years). These are highly implausible. 4. Confidence in the groupings/tree topology is a posterior probability, with black dots for nodes > .9. This means **very** high confidence in the split position in the tree, or the presence of the group bearing that common ancestor. ```{r BEAST_PARTIAL_meta, fig.height=14,fig.width=18, fig.align="center", message=FALSE, echo=FALSE, warning= TRUE, dpi=300} insetTree <-TREE_BEAST_WGS+ geom_hilight(node=145,fill = "pink", alpha = .5)+ theme_void() + xlim_tree(2022) p <- TREE_BEAST_WGS+ geom_tiplab(aes(label=paste(WHO_NAME,"--",CASE)), color = "black", time_scale=TRUE, align = TRUE, size=5)+ geom_range("height_0.95_HPD", color='blue', size=2, alpha=.5) + geom_nodepoint(aes(subset=(as.numeric(posterior) >= .90)), color = "black", size = 5)+ theme_tree2() + scale_x_continuous(breaks = seq(2014,2022,2),labels = seq(2014,2022,2))+xlim_tree(2030)+ geom_vline(xintercept=c(2014.157,2016.553),linetype='dashed')+ theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1)) mainTreeWGS <-viewClade(p,145) + #High confidence Case 1->4 geom_taxalink(taxa1='MVs_Wisconsin.USA_35.21', taxa2='MVs_Wisconsin.USA_37.21_4', color='red', alpha = .5, linetype = 'solid', size = 2, curvature = 1, arrow=arrow(length=unit(0.02, "npc")))+ #High confidence Case 3->8 geom_taxalink(taxa1='MVs_Wisconsin.USA_37.21_2', taxa2='MVs_Wisconsin.USA_39.21', color='red', alpha = .5, linetype = 'solid', size = 2, curvature = 1, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->1 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Wisconsin.USA_35.21', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->24 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_35.21', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->47 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_3', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->25 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_2', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->31 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ theme(axis.text.x = element_text(size = 14, face = "bold")) tipcategories_filter <- tipcategories %>% filter(IS_SUBTREE == TRUE) %>% filter(WGS_EXISTS == TRUE) baseTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(BASE)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Base"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Base")) clusterTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(CLUSTER)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Main\nCluster"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Main Cluster")) importationTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(IMPORT)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Importation\nStatus"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Importation Status")) barrackPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(BARRACK_BAY_MULTIPLE)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Barrack/\nBay"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Barrack/Bay")) flightPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(FLIGHTS_WITH_MULTIPLE_WGS)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Shared\nFlight"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Shared Flight")) familyTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(FAMILY_GROUP)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Family\nGroup"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Family Group")) # order - Tree, base, cluster, subcluster, Family Group familyTilePlot %>% insert_left(barrackPlot,) %>% insert_left(importationTilePlot,) %>% insert_left(flightPlot,) %>% insert_left(baseTilePlot,) %>% insert_left(clusterTilePlot,) %>% insert_left(mainTreeWGS,5) ``` ## Bayesian Phylogeny N450 ### Bayesian N450 with posterior support for node date Bars show 95% HPD for the inferred **time in years** of the OAW common ancestor node. ```{r BEAST_WHOLE_N450, fig.height=14,fig.width=16, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} p <- TREE_BEAST_N450+ geom_hilight(node=145,fill = "pink", alpha = .5)+ geom_vline(xintercept=c(2019.883,2021.190),linetype='dashed')+ theme_tree2() + xlim_tree(2022) p ``` ### N450 Supplement Tree, Bayesian ```{r BEAST_PARTIAL_meta_sub_N450, fig.height=14,fig.width=18, fig.align="center", message=FALSE, echo=FALSE, warning= TRUE} p <- TREE_BEAST_N450+ # Everything else is black geom_tiplab(aes(label=paste(WHO_NAME,"--",CASE)), color = "black", time_scale=TRUE, align = TRUE, size=5)+ geom_tippoint(aes(subset=(N450_MATCHES_WGS == "SANGER_ONLY")), color = "purple", fill = "purple", size = 5, alpha = .5, shape = 24)+ geom_range("height_0.95_HPD", color='blue', size=2, alpha=.5) + geom_nodepoint(aes(subset=(as.numeric(posterior) >= .90)), color = "black", size = 5)+ geom_vline(xintercept=c(2019.883,2021.190),linetype='dashed')+ theme_tree2() + scale_x_continuous( breaks = seq(2020,2022,1),labels = seq(2020,2022,1))+xlim_tree(2024) mainTreeN450 <-viewClade(p,167) + #High confidence Case 1->4 geom_taxalink(taxa1='MVs_Wisconsin.USA_35.21', taxa2='MVs_Wisconsin.USA_37.21_4', color='red', alpha = .5, linetype = 'solid', size = 2, curvature = -1, arrow=arrow(length=unit(0.02, "npc")))+ #High confidence Case 3->8 geom_taxalink(taxa1='MVs_Wisconsin.USA_37.21_2', taxa2='MVs_Wisconsin.USA_39.21', color='red', alpha = .5, linetype = 'solid', size = 2, curvature = -1, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->1 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Wisconsin.USA_35.21', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->24 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_35.21', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->47 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_3', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->25 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21_2', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ #Lower confidence Case 35->31 geom_taxalink(taxa1='MVs_Virginia.USA_39.21', taxa2='MVs_Virginia.USA_36.21', color='grey', alpha = .7, linetype = 'solid', size = 2, curvature = .5, arrow=arrow(length=unit(0.02, "npc")))+ theme(axis.text.x = element_text(size = 14, face = "bold")) tipcategories_filter <- tipcategories %>% filter(IS_SUBTREE == TRUE) %>% filter(N450_EXISTS == TRUE) baseTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(BASE)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Base"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Base")) clusterTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(CLUSTER)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Main\nCluster"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Main Cluster")) importationTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(IMPORT)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Importation\nStatus"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Importation Status")) barrackPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(BARRACK_BAY_MULTIPLE)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Barrack/\nBay"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Barrack/Bay")) flightPlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(FLIGHTS_WITH_MULTIPLE)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Shared\nFlight"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Shared Flight")) familyTilePlot <- ggplot(data = tipcategories_filter, aes(as.numeric(as.character(BASE_FACTOR)),TAXON))+ geom_tile(aes(fill = factor(FAMILY_GROUP)),color = 'black') + scale_x_continuous(breaks = c(1), limits = c(.5,1.5), labels = c("Family\nGroup"))+ scale_fill_adaptive(name = "nejm", palette = "default")+ theme_tree2()+ theme(axis.text.x = element_text(size = 14, face = "bold"))+ guides(fill = guide_legend(title = "Family Group")) # order - Tree, base, cluster, subcluster, Family Group familyTilePlot %>% insert_left(barrackPlot,) %>% insert_left(importationTilePlot,) %>% insert_left(flightPlot,) %>% insert_left(baseTilePlot,) %>% insert_left(clusterTilePlot,) %>% insert_left(mainTreeN450,4) ``` ### Combined comparison tree of WGS and N450 ```{r WGS_N450_COMBINED, fig.height=12,fig.width=14, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} plot_grid(mainTreeN450, mainTreeWGS, labels = c('A', 'B'), label_size = 20) ``` ### Base Distance Counts ```{r HAMMING_MATRIX_SETUP, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} ### These are the functions needed for assembly and join-up of the base distance calculation tables alignment_WGS <- readDNAMultipleAlignment(params$WGS_ALIGNMENT, format = "fasta") alignment_N450 <- readDNAMultipleAlignment(params$N450_ALIGNMENT, format = "fasta") #TODO: narrow it to get the N450 from the inclusive range 1126-1575 counts_matrix_WGS <-stringDist(unmasked(alignment_WGS), method = "hamming", ignoreCase = FALSE, diag = FALSE, upper = FALSE) counts_matrix_N450 <-stringDist(unmasked(alignment_N450), method = "hamming", ignoreCase = FALSE, diag = FALSE, upper = FALSE) #select(TAXON, index of row that = min(CASE)) # Select the taxon and the counts_matrix_WGS <-as.matrix(counts_matrix_WGS) %>% as.tibble(rownames = "CASE_NOMENCLATURE") counts_matrix_N450 <-as.matrix(counts_matrix_N450) %>% as.tibble(rownames = "CASE_NOMENCLATURE") ## Coerce the tree metadata to a tibble, filter out only the OAW cases transform_single_result <-function(filterexpression, count_matrix) { metadata_OAW_ONLY <- as.tibble(tipcategories) %>% filter(ISOAW == TRUE) %>% filter(eval(parse(text = filterexpression))) %>% select(TAXON, CASE, FAMILY_GROUP) %>% mutate(LOWEST_CASE = min(CASE)) %>% mutate(FIRST_TAXON = .[CASE == min(CASE),"TAXON"]) %>% select(TAXON, CASE, FIRST_TAXON) %>% left_join(count_matrix, by = c("TAXON" = "CASE_NOMENCLATURE")) FIRST_TAXON_SELECTOR <-metadata_OAW_ONLY %>% distinct(FIRST_TAXON) %>% pluck(1,1) #Make fake tibble with all case numbers. joiner_tibble_cases <- tibble(CASE = 1:47) %>% mutate(CASE = str_c("CASE_", CASE)) finalselectrenamed <-metadata_OAW_ONLY %>% select(TAXON, CASE, FIRST_TAXON, all_of(FIRST_TAXON_SELECTOR)) %>% dplyr::rename(SUBSTITUTIONS_FROM_FIRST=4) %>% mutate(grouping1 = str_replace_all(filterexpression, "\"", "_")) %>% mutate(grouping2 = str_replace_all(grouping1, "\"", "_")) %>% mutate(grouping3 = str_replace_all(grouping2, " == ", "_")) %>% mutate(GROUPING = str_replace_all(grouping3, "__", "_")) %>% mutate(CASE = str_c("CASE_", CASE)) %>% select(-c(grouping1, grouping2, grouping3)) finalselectjoined <-left_join(joiner_tibble_cases, finalselectrenamed, by = "CASE") %>% pivot_wider(names_from = CASE, values_from = SUBSTITUTIONS_FROM_FIRST) %>% filter (!is.na(GROUPING)) %>% select(-c(TAXON, FIRST_TAXON)) %>% group_by(GROUPING) %>% summarize_all(., ~ ifelse(any(., !is.na(.)), sum(., na.rm=TRUE), NA)) %>% ungroup return(finalselectjoined) } ``` ### The substitution distance for WGS ```{r HAMMING_MATRIX_WGS, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} list_exposure_groups_WGs <-list("FAMILY_GROUP == \"A\"", "FAMILY_GROUP == \"B\"", "FAMILY_GROUP == \"C\"", "FLIGHTS_WITH_MULTIPLE == \"E\"", "FLIGHTS_WITH_MULTIPLE == \"F\"", "FLIGHTS_WITH_MULTIPLE == \"G\"", "FLIGHTS_WITH_MULTIPLE == \"M\"", "FLIGHTS_WITH_MULTIPLE == \"O\"", "FLIGHTS_WITH_MULTIPLE == \"R\"", "IMPORT == \"International_Importation\"", "IMPORT == \"US-Acquired\"", "BARRACK_BAY_MULTIPLE == \"1725\"", "BARRACK_BAY_MULTIPLE == \"1740\"", "BARRACK_BAY_MULTIPLE == \"Upshur Bay 2 (West Side)\"", "CASE %in% c(1,4)", "CASE %in% c(3,8)", "CASE %in% c(1,35)", "CASE %in% c(24,35)", "CASE %in% c(25,35)", "CASE %in% c(31,35)", "CASE %in% c(47,35)" ) tibble_list_WGS <-lapply(list_exposure_groups_WGs, transform_single_result, count_matrix = counts_matrix_WGS) rbound_tibble_list_WGS <-bind_rows(tibble_list_WGS) rbound_tibble_list_WGS #kable(rbound_tibble_list_WGS, align = "c", caption = "Row-Bound Substitutions WGS.")%>% # kable_styling("hover", full_width = F, fixed_thead = T) %>% #row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>% # scroll_box(width = "100%", height = "300px") write_tsv(rbound_tibble_list_WGS, "WGS_SUBSTITUTIONS.tsv") ``` ### The substitution distance for N450 ```{r HAMMING_MATRIX_N450, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} list_exposure_groups_N450 <-list("FAMILY_GROUP == \"A\"", "FAMILY_GROUP == \"B\"", "FAMILY_GROUP == \"C\"", "FLIGHTS_WITH_MULTIPLE == \"E\"", "FLIGHTS_WITH_MULTIPLE == \"F\"", "FLIGHTS_WITH_MULTIPLE == \"G\"", "FLIGHTS_WITH_MULTIPLE == \"M\"", "FLIGHTS_WITH_MULTIPLE == \"O\"", "FLIGHTS_WITH_MULTIPLE == \"R\"", "IMPORT == \"International_Importation\"", "IMPORT == \"US-Acquired\"", "BARRACK_BAY_MULTIPLE == \"1725\"", "BARRACK_BAY_MULTIPLE == \"1740\"", "BARRACK_BAY_MULTIPLE == \"Upshur Bay 2 (West Side)\"", "CASE %in% c(1,4)", "CASE %in% c(3,8)", "CASE %in% c(30,28)", "CASE %in% c(1,35)", "CASE %in% c(24,35)", "CASE %in% c(25,35)", "CASE %in% c(30,35)", "CASE %in% c(31,35)", "CASE %in% c(47,35)" ) counts_matrix_N450 tibble_list_N450 <-lapply(list_exposure_groups_N450, transform_single_result, count_matrix = counts_matrix_N450) rbound_tibble_list_N450 <-bind_rows(tibble_list_N450) rbound_tibble_list_N450 #kable(rbound_tibble_list_N450, align = "c", caption = "Row-Bound Substitutions N450.")%>% # kable_styling("hover", full_width = F, fixed_thead = T) %>% #row_spec(which(tipcategories$IS_IMPORT == TRUE), background = "#FFCCCC") %>% # scroll_box(width = "100%", height = "300px") write_tsv(rbound_tibble_list_N450, "N450_SUBSTITUTIONS.tsv") ``` ### The substitution distances for N450 and WGS, for select reported cases. ```{r HAMMING_MATRIX_N450, fig.height=4,fig.width=8.5, fig.align="center", message=FALSE, echo=FALSE, warning= FALSE} # Case 3-8 cat('# Base Count Differences Case 3->8 `', '`\n') cat('# WGS (N-L Span) `', '`\n') counts_matrix_WGS["MVs_Wisconsin.USA_37.21_2","MVs_Wisconsin.USA_39.21"] cat('# N450 `', '`\n') counts_matrix_N450["MVs_Wisconsin.USA_37.21_2", "MVs_Wisconsin.USA_39.21"] # Case 1-4 cat('# Base Count Differences Case 1->4 `', '`\n') cat('# WGS (N-L Span) `', '`\n') counts_matrix_WGS["MVs_Wisconsin.USA_35.21", "MVs_Wisconsin.USA_37.21_4"] cat('# N450 `', '`\n') counts_matrix_N450["MVs_Wisconsin.USA_35.21", "MVs_Wisconsin.USA_37.21_4"] # Case 30-28 # Not in the matrix. # Case Case35 -> Case1 cat('# Base Count Differences Case 35->1 `', '`\n') cat('# WGS (N-L Span) `', '`\n') counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Wisconsin.USA_35.21"] cat('# N450 `', '`\n') counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Wisconsin.USA_35.21"] # Case Case35 -> Case24 cat('# Base Count Differences Case 35->24 `', '`\n') cat('# WGS (N-L Span) `', '`\n') counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_35.21"] cat('# N450 `', '`\n') counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_35.21"] # Case Case35 -> Case25 cat('# Base Count Differences Case 35->25 `', '`\n') cat('# WGS (N-L Span) `', '`\n') counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_2"] cat('# N450 `', '`\n') counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_2"] # Case Case35 -> Case31 cat('# Base Count Differences Case 35->31 `', '`\n') cat('# WGS (N-L Span) `', '`\n') counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21"] cat('# N450 `', '`\n') counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21"] # Case Case35 -> Case30 #Not in the matrix # Case Case35 -> Case47 cat('# Base Count Differences Case 35->47 `', '`\n') cat('# WGS (N-L Span) `', '`\n') counts_matrix_WGS["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_3"] cat('# N450 `', '`\n') counts_matrix_N450["MVs_Virginia.USA_39.21", "MVs_Virginia.USA_36.21_3"] ```