Data analysis for:
Dylan J. Padilla Perez, 2023. Geographic and seasonal variation of the for gene reveal signatures of local adaptation in Drosophila melanogaster. Dylan J. Padilla Perez, School of Life Sciences, Arizona State University, Tempe, AZ 85287, USA.Library
library(adegenet)
library(ade4)
library(car)
library(ComplexHeatmap)
library(data.table)
library(dartR)
library(ecodist)
library(hierfstat)
library(gplots)
library(LEA)
library(lfmm)
library(maps)
library(mapplots)
library(MASS)
library(PBSmapping)
library(pegas)
library(poppr)
library(qvalue)
library(randomcoloR)
library(raster)
library(reshape)
library(rworldmap)
library(scales)
library(SeqArray)
library(SeqVarTools)
library(shape)
library(SNPRelate)
library(stringr)
library(vcfR)
library(vegan)
library(xtable)
Session information
R.version
_
platform x86_64-apple-darwin17.0
arch x86_64
os darwin17.0
system x86_64, darwin17.0
status
major 4
minor 2.2
year 2022
month 10
day 31
svn rev 83211
language R
version.string R version 4.2.2 (2022-10-31)
nickname Innocent and Trusting
sessionInfo()
R version 4.2.2 (2022-10-31)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] xtable_1.8-4 vegan_2.6-4 lattice_0.21-8
[4] permute_0.9-7 vcfR_1.14.0 stringr_1.5.0
[7] SNPRelate_1.32.2 shape_1.4.6 SeqVarTools_1.36.0
[10] SeqArray_1.38.0 gdsfmt_1.34.1 scales_1.2.1
[13] rworldmap_1.3-6 reshape_0.8.9 raster_3.6-20
[16] sp_1.6-0 randomcoloR_1.1.0.1 qvalue_2.30.0
[19] poppr_2.9.4 pegas_1.2 ape_5.7-1
[22] PBSmapping_2.73.2 MASS_7.3-59 mapplots_1.5.1
[25] maps_3.4.1 lfmm_1.1 LEA_3.10.2
[28] gplots_3.1.3 hierfstat_0.5-11 ecodist_2.0.9
[31] dartR_2.7.2 dartR.data_1.0.2 dplyr_1.1.2
[34] ggplot2_3.4.2 data.table_1.14.8 ComplexHeatmap_2.14.0
[37] car_3.1-2 carData_3.0-5 adegenet_2.1.10
[40] ade4_1.7-22 rmarkdown_2.21
loaded via a namespace (and not attached):
[1] utf8_1.2.3 R.utils_2.12.2 tidyselect_1.2.0
[4] combinat_0.0-8 Rtsne_0.16 maptools_1.1-6
[7] StAMPP_1.6.3 GWASExactHW_1.01 munsell_0.5.0
[10] codetools_0.2-19 withr_2.5.0 colorspace_2.1-0
[13] Biobase_2.58.0 knitr_1.42 stats4_4.2.2
[16] RgoogleMaps_1.4.5.3 logistf_1.24.1 GenomeInfoDbData_1.2.9
[19] gap.datasets_0.0.5 vctrs_0.6.2 generics_0.1.3
[22] xfun_0.39 R6_2.5.1 doParallel_1.0.17
[25] GenomeInfoDb_1.34.9 clue_0.3-64 fields_14.1
[28] bitops_1.0-7 cachem_1.0.8 promises_1.2.0.1
[31] pinfsc50_1.2.0 gtable_0.3.3 formula.tools_1.7.1
[34] spam_2.9-1 rlang_1.1.1 calibrate_1.7.7
[37] GlobalOptions_0.1.2 splines_4.2.2 rgdal_1.6-6
[40] broom_1.0.4 yaml_2.3.7 reshape2_1.4.4
[43] abind_1.4-5 backports_1.4.1 httpuv_1.6.9
[46] tools_4.2.2 ellipsis_0.3.2 jquerylib_0.1.4
[49] RColorBrewer_1.1-3 BiocGenerics_0.44.0 Rcpp_1.0.10
[52] plyr_1.8.8 zlibbioc_1.44.0 purrr_1.0.1
[55] RCurl_1.98-1.12 GetoptLong_1.0.5 viridis_0.6.3
[58] S4Vectors_0.36.2 cluster_2.1.4 magrittr_2.0.3
[61] genetics_1.3.8.1.3 circlize_0.4.15 mvtnorm_1.1-3
[64] matrixStats_0.63.0 patchwork_1.1.2 mime_0.12
[67] evaluate_0.20 IRanges_2.32.0 gridExtra_2.3
[70] compiler_4.2.2 tibble_3.2.1 mice_3.15.0
[73] KernSmooth_2.23-20 V8_4.3.0 crayon_1.5.2
[76] gdistance_1.6.2 R.oo_1.25.0 htmltools_0.5.5
[79] mgcv_1.8-42 later_1.3.0 tidyr_1.3.0
[82] DBI_1.1.3 PopGenReport_3.0.7 boot_1.3-28.1
[85] Matrix_1.5-4 cli_3.6.1 R.methodsS3_1.8.2
[88] gdata_2.18.0.1 parallel_4.2.2 dotCall64_1.0-2
[91] igraph_1.4.2 GenomicRanges_1.50.2 pkgconfig_2.0.3
[94] foreign_0.8-84 terra_1.7-29 foreach_1.5.2
[97] bslib_0.4.2 XVector_0.38.0 digest_0.6.31
[100] polysat_1.7-7 Biostrings_2.66.0 operator.tools_1.6.3
[103] curl_5.0.0 gap_1.5-1 shiny_1.7.4
[106] gtools_3.9.4 rjson_0.2.21 lifecycle_1.0.3
[109] nlme_3.1-162 dismo_1.3-9 jsonlite_1.8.4
[112] seqinr_4.2-30 viridisLite_0.4.2 fansi_1.0.4
[115] pillar_1.9.0 GGally_2.1.2 fastmap_1.1.1
[118] glue_1.6.2 mmod_1.3.3 png_0.1-8
[121] iterators_1.0.14 stringi_1.7.12 sass_0.4.5
[124] caTools_1.18.2
Quality control and SNP isolation
## load meta-data file
samps <- fread("samps_10Nov2020.csv")
## open GDS for common SNPs (PoolSNP)
genofile <- seqOpen("dest.all.PoolSNP.001.50.10Nov2020.ann.gds", allow.duplicate = TRUE)
## common SNP.dt
snp.dt <- data.table(chr = seqGetData(genofile, "chromosome"),
pos = seqGetData(genofile, "position"),
nAlleles = seqGetData(genofile, "$num_allele"),
id = seqGetData(genofile, "variant.id"),
genotype = seqGetData(genofile, "allele"))
snp.dt <- snp.dt[nAlleles == 2]
seqSetFilter(genofile, snp.dt$id)
## # of selected variants: 4,281,164
## filter to target
snp.tmp <- data.table(chr ="2L", pos = 3622074:3656953)
setkey(snp.tmp, chr, pos)
setkey(snp.dt, chr, pos)
seqSetFilter(genofile, variant.id=snp.dt[J(snp.tmp), nomatch = 0]$id)
## # of selected variants: 1,613
## get annotations
message("Annotations")
tmp <- seqGetData(genofile, "annotation/info/ANN")
len1 <- tmp$length
len2 <- tmp$data
snp.dt1 <- data.table(len = rep(len1, times = len1), ann = len2, id = rep(snp.dt[J(snp.tmp), nomatch = 0]$id, times = len1))
## Extract data between the 2nd and third | symbol
snp.dt1[ ,class := tstrsplit(snp.dt1$ann,"\\|")[[2]]]
snp.dt1[ ,gene := tstrsplit(snp.dt1$ann,"\\|")[[4]]]
## Collapse additional annotations to original SNP vector length
snp.dt1.an <- snp.dt1[,list(n = length(class), col = paste(class, collapse = ","), gene = paste(gene, collapse = ",")), list(variant.id = id)]
snp.dt1.an[,col := tstrsplit(snp.dt1.an$col,"\\,")[[1]]]
snp.dt1.an[,gene := tstrsplit(snp.dt1.an$gene,"\\,")[[1]]]
## get frequencies
message("Allele Freqs")
ad <- seqGetData(genofile, "annotation/format/AD")
dp <- seqGetData(genofile, "annotation/format/DP")
af <- data.table(ad = expand.grid(ad$data)[,1],
dp = expand.grid(dp)[,1],
sampleId = rep(seqGetData(genofile, "sample.id"), dim(ad$data)[2]),
variant.id = rep(seqGetData(genofile, "variant.id"), each = dim(ad$data)[1]))
## merge them together
message("merge")
afi <- merge(af, snp.dt1.an, by = "variant.id")
afi <- merge(afi, snp.dt, by.x = "variant.id", by.y = "id")
afi[ , af := ad/dp]
## calculate effective read-depth
afis <- merge(afi, samps, by = "sampleId")
afis[chr == "X", nEff := round((dp*nFlies - 1)/(dp+nFlies))]
afis[chr != "X", nEff := round((dp*2*nFlies - 1)/(dp+2*nFlies))]
afis[ ,af_nEff := round(af*nEff)/nEff]
## subsetting dataset
names(afis)
## [1] "sampleId" "variant.id" "ad" "dp"
## [5] "n" "col" "gene" "chr"
## [9] "pos" "nAlleles" "genotype" "af"
## [13] "locality" "country" "city" "collectionDate"
## [17] "DateExact" "lat" "long" "season"
## [21] "type" "continent" "set" "nFlies"
## [25] "SRA_accession" "SRA_experiment" "Model" "collector"
## [29] "sampleType" "year" "yday" "stationId"
## [33] "dist_km" "In(2L)t" "In(2R)Ns" "In(3L)P"
## [37] "In(3R)C" "In(3R)K" "In(3R)Mo" "In(3R)Payne"
## [41] "status" "bio1" "bio2" "bio3"
## [45] "bio4" "bio5" "bio6" "bio7"
## [49] "bio8" "bio9" "bio10" "bio11"
## [53] "bio12" "bio13" "bio14" "bio15"
## [57] "bio16" "bio17" "bio18" "bio19"
## [61] "tmin1" "tmin2" "tmin3" "tmin4"
## [65] "tmin5" "tmin6" "tmin7" "tmin8"
## [69] "tmin9" "tmin10" "tmin11" "tmin12"
## [73] "tmax1" "tmax2" "tmax3" "tmax4"
## [77] "tmax5" "tmax6" "tmax7" "tmax8"
## [81] "tmax9" "tmax10" "tmax11" "tmax12"
## [85] "prec1" "prec2" "prec3" "prec4"
## [89] "prec5" "prec6" "prec7" "prec8"
## [93] "prec9" "prec10" "prec11" "prec12"
## [97] "propSimNorm" "sex" "nEff" "af_nEff"
season.dat <- as.data.frame(afis[ , c(21, 30, 1, 13, 14, 15, 19, 18, 20, 2, 11, 12, 6, 22, 99)])
str(season.dat)
## 'data.frame': 438736 obs. of 15 variables:
## $ type : chr "pooled" "pooled" "pooled" "pooled" ...
## $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ sampleId : chr "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" ...
## $ locality : chr "AT_Mau" "AT_Mau" "AT_Mau" "AT_Mau" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ city : chr "Mauternbach" "Mauternbach" "Mauternbach" "Mauternbach" ...
## $ long : num 15.6 15.6 15.6 15.6 15.6 ...
## $ lat : num 48.4 48.4 48.4 48.4 48.4 ...
## $ season : chr "spring" "spring" "spring" "spring" ...
## $ variant.id: int 162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
## $ genotype : chr "A,C" "A,C" "C,T" "T,C" ...
## $ af : num 1 0 0.1569 0.0196 NA ...
## $ col : chr "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ nEff : num 28 26 31 31 NA 36 36 33 24 30 ...
head(season.dat)
## type year sampleId locality country city long lat season
## 1 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 2 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 3 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 4 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 5 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## 6 pooled 2014 AT_Mau_14_01 AT_Mau Austria Mauternbach 15.56 48.375 spring
## variant.id genotype af col continent nEff
## 1 162333 A,C 1.00000000 3_prime_UTR_variant Europe 28
## 2 162334 A,C 0.00000000 3_prime_UTR_variant Europe 26
## 3 162335 C,T 0.15686275 3_prime_UTR_variant Europe 31
## 4 162336 T,C 0.01960784 3_prime_UTR_variant Europe 31
## 5 162337 C,T NA 3_prime_UTR_variant Europe NA
## 6 162338 C,T 0.00000000 3_prime_UTR_variant Europe 36
dim(season.dat)
## [1] 438736 15
season.dat$season[season.dat$season == "frost"] <- "fall"
season.dat$season <- as.factor(season.dat$season)
season.filter.dat <- data.frame()
localities <- levels(as.factor(season.dat$locality))
seasons <- c("fall", "spring")
## getting samples collected at least once during spring and winter
for(loc in localities){
dft <- season.dat[season.dat$locality == loc, ]
if(all(seasons %in% unique(dft$season))){
season.filter.dat <- rbind(season.filter.dat, dft)
}
}
dim(season.filter.dat)
## [1] 285501 15
dim(na.omit(season.filter.dat))
## [1] 269454 15
str(season.filter.dat)
## 'data.frame': 285501 obs. of 15 variables:
## $ type : chr "pooled" "pooled" "pooled" "pooled" ...
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ locality : chr "AT_gr" "AT_gr" "AT_gr" "AT_gr" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ city : chr "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
## $ long : num 16.4 16.4 16.4 16.4 16.4 ...
## $ lat : num 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
## $ genotype : chr "A,C" "A,C" "C,T" "T,C" ...
## $ af : num 0.9773 NA 0.2273 0 0.0244 ...
## $ col : chr "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ nEff : num 29 NA 29 33 28 28 27 25 27 25 ...
mat1 <- season.filter.dat[ , c(3, 6, 5, 9, 10, 14, 12, 15)]
names(mat1)
## [1] "sampleId" "city" "country" "season" "variant.id"
## [6] "continent" "af" "nEff"
str(mat1)
## 'data.frame': 285501 obs. of 8 variables:
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ city : chr "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ af : num 0.9773 NA 0.2273 0 0.0244 ...
## $ nEff : num 29 NA 29 33 28 28 27 25 27 25 ...
dim(mat1)
## [1] 285501 8
mat1 <- mat1[!is.na(mat1$nEff), ]
dim(mat1)
## [1] 269454 8
mat1 <- mat1[mat1$nEff >= 28, ] ## Applying a Neff filter of 28
dim(mat1)
## [1] 212017 8
mat1 <- mat1[ , -8]
str(mat1)
## 'data.frame': 212017 obs. of 7 variables:
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ city : chr "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ af : num 0.9773 0.2273 0 0.0244 0 ...
## Changing name of cities
mat1$city <- as.factor(mat1$city)
levels(mat1$city)
## [1] "Akaa" "Athens"
## [3] "Barcelona" "Benton Harbor"
## [5] "Broggingen" "Brzezina"
## [7] "Chalet \xe0 Gobet" "Charlottesville"
## [9] "Charlotttesville" "Chornobyl (Cooling pond)"
## [11] "Chornobyl Applegarden" "Chornobyl Applegarden (New)"
## [13] "Chornobyl Kopachi" "Chornobyl Polisske"
## [15] "Chornobyl Yaniv" "Cross Plains"
## [17] "Drogobych" "Esparto"
## [19] "Gimenells" "Gotheron"
## [21] "Gross-Enzersdorf" "Homestead"
## [23] "Ithaca" "Kalna"
## [25] "Karensminde" "Karensminde Orchard"
## [27] "Kharkiv" "Kyiv"
## [29] "Kyiv (Vyshneve)" "Lancaster"
## [31] "Linvilla" "Market Harborough"
## [33] "Mauternbach" "Munich"
## [35] "Odesa" "Odessa"
## [37] "Sheffield" "Slankamen. Vinogradi"
## [39] "State College" "Sudbury"
## [41] "Topeka" "Tuolumne"
## [43] "valday" "Valday"
## [45] "Viltain" "Yesiloz"
## [47] "Yesiloz TR13" "Yesiloz TR14"
## [49] "Yesiloz TR15" "Yesiloz TR16"
## [51] "Yesiloz TR17" "Yesiloz TR18"
levels(mat1$city) <- gsub("valday", "Valday", levels(mat1$city))
levels(mat1$city) <- gsub("Odesa", "Odessa", levels(mat1$city))
levels(mat1$city) <- gsub("Charlotttesville", "Charlottesville", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR13", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR14", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR15", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR16", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR17", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR18", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Karensminde Orchard", "Karensminde", levels(mat1$city))
levels(mat1$city) <- gsub("Slankamen. Vinogradi", "Slankamen-Vinogradi", levels(mat1$city))
levels(mat1$city) <- gsub("Chalet \xe0 Gobet", "Chalet-A-Gobet", levels(mat1$city))
levels(mat1$city)
## [1] "Akaa" "Athens"
## [3] "Barcelona" "Benton Harbor"
## [5] "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville"
## [9] "Chornobyl (Cooling pond)" "Chornobyl Applegarden"
## [11] "Chornobyl Applegarden (New)" "Chornobyl Kopachi"
## [13] "Chornobyl Polisske" "Chornobyl Yaniv"
## [15] "Cross Plains" "Drogobych"
## [17] "Esparto" "Gimenells"
## [19] "Gotheron" "Gross-Enzersdorf"
## [21] "Homestead" "Ithaca"
## [23] "Kalna" "Karensminde"
## [25] "Kharkiv" "Kyiv"
## [27] "Kyiv (Vyshneve)" "Lancaster"
## [29] "Linvilla" "Market Harborough"
## [31] "Mauternbach" "Munich"
## [33] "Odessa" "Sheffield"
## [35] "Slankamen-Vinogradi" "State College"
## [37] "Sudbury" "Topeka"
## [39] "Tuolumne" "Valday"
## [41] "Viltain" "Yesiloz"
levels(mat1$city)[c(9, 10, 11, 12, 13, 14)] <- "Chornobyl"
levels(mat1$city)
## [1] "Akaa" "Athens" "Barcelona"
## [4] "Benton Harbor" "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville" "Chornobyl"
## [10] "Cross Plains" "Drogobych" "Esparto"
## [13] "Gimenells" "Gotheron" "Gross-Enzersdorf"
## [16] "Homestead" "Ithaca" "Kalna"
## [19] "Karensminde" "Kharkiv" "Kyiv"
## [22] "Kyiv (Vyshneve)" "Lancaster" "Linvilla"
## [25] "Market Harborough" "Mauternbach" "Munich"
## [28] "Odessa" "Sheffield" "Slankamen-Vinogradi"
## [31] "State College" "Sudbury" "Topeka"
## [34] "Tuolumne" "Valday" "Viltain"
## [37] "Yesiloz"
levels(mat1$city)[22] <- "Kyiv"
levels(mat1$city)
## [1] "Akaa" "Athens" "Barcelona"
## [4] "Benton Harbor" "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville" "Chornobyl"
## [10] "Cross Plains" "Drogobych" "Esparto"
## [13] "Gimenells" "Gotheron" "Gross-Enzersdorf"
## [16] "Homestead" "Ithaca" "Kalna"
## [19] "Karensminde" "Kharkiv" "Kyiv"
## [22] "Lancaster" "Linvilla" "Market Harborough"
## [25] "Mauternbach" "Munich" "Odessa"
## [28] "Sheffield" "Slankamen-Vinogradi" "State College"
## [31] "Sudbury" "Topeka" "Tuolumne"
## [34] "Valday" "Viltain" "Yesiloz"
levels(mat1$city)[29] <- "Slankamenacki-Vinogradi"
levels(mat1$city)
## [1] "Akaa" "Athens"
## [3] "Barcelona" "Benton Harbor"
## [5] "Broggingen" "Brzezina"
## [7] "Chalet-A-Gobet" "Charlottesville"
## [9] "Chornobyl" "Cross Plains"
## [11] "Drogobych" "Esparto"
## [13] "Gimenells" "Gotheron"
## [15] "Gross-Enzersdorf" "Homestead"
## [17] "Ithaca" "Kalna"
## [19] "Karensminde" "Kharkiv"
## [21] "Kyiv" "Lancaster"
## [23] "Linvilla" "Market Harborough"
## [25] "Mauternbach" "Munich"
## [27] "Odessa" "Sheffield"
## [29] "Slankamenacki-Vinogradi" "State College"
## [31] "Sudbury" "Topeka"
## [33] "Tuolumne" "Valday"
## [35] "Viltain" "Yesiloz"
dim(mat1)
## [1] 212017 7
str(mat1)
## 'data.frame': 212017 obs. of 7 variables:
## $ sampleId : chr "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
## $ city : Factor w/ 36 levels "Akaa","Athens",..: 15 15 15 15 15 15 15 15 15 15 ...
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ season : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
## $ variant.id: int 162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
## $ continent : chr "Europe" "Europe" "Europe" "Europe" ...
## $ af : num 0.9773 0.2273 0 0.0244 0 ...
mat1 <- mat1[!mat1$country == "United Kingdom", ] ## The united kingdom only has 4 pooled samples, so I removed it from the analyses
mat2 <- reshape(mat1, idvar = c("sampleId", "city", "country", "season", "continent"), timevar = "variant.id", direction = "wide")
mat2[1:6, 1:10]
## sampleId city country season continent af.162333
## 9679 AT_gr_12_fall Gross-Enzersdorf Austria fall Europe 0.9772727
## 11292 AT_gr_12_spring Gross-Enzersdorf Austria spring Europe 0.8888889
## 1 AT_Mau_14_01 Mauternbach Austria spring Europe 1.0000000
## 1617 AT_Mau_14_02 Mauternbach Austria fall Europe NA
## 3227 AT_Mau_15_50 Mauternbach Austria spring Europe 0.7714286
## 4840 AT_Mau_15_51 Mauternbach Austria fall Europe 0.6984127
## af.162335 af.162336 af.162337 af.162338
## 9679 0.2272727 0.00000000 0.02439024 0
## 11292 NA 0.00000000 NA NA
## 1 0.1568627 0.01960784 NA 0
## 1617 NA 0.00000000 0.00000000 0
## 3227 0.1323529 0.00000000 0.00000000 0
## 4840 0.2656250 0.01282051 0.00000000 0
dim(mat2)
## [1] 170 1614
mat2$city <- as.character(mat2$city)
mat2$season <- as.character(mat2$season)
mat2 <- mat2[mat2$city != "Homestead", ]
mat2 <- mat2[mat2$city != "Athens", ]
mat2 <- mat2[mat2$city != "Sudbury", ]
mat2 <- mat2[mat2$city != "Brzezina", ]
mat2 <- mat2[mat2$city != "Kalna", ]
mat2 <- mat2[mat2$city != "Slankamenacki-Vinogradi", ]
mat2 <- mat2[mat2$city != "Topeka", ]
mat2 <- mat2[mat2$city != "Chalet-A-Gobet", ]
usa <- c("Lancaster", "Ithaca", "Cross Plains", "Benton Harbor", "Linvilla", "State College", "Tuolumne", "Esparto", "Charlottesville")
for(i in usa){
mat2$country[mat2$city == i] <- i
}
mat2$country <- as.factor(mat2$country)
levels(mat2$country)
## [1] "Austria" "Benton Harbor" "Charlottesville" "Cross Plains"
## [5] "Denmark" "Esparto" "Finland" "France"
## [9] "Germany" "Ithaca" "Lancaster" "Linvilla"
## [13] "Russia" "Spain" "State College" "Tuolumne"
## [17] "Turkey" "Ukraine"
levels(mat2$country)[c(2, 4)] <- "MI-WI"
levels(mat2$country)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "Ithaca" "Lancaster" "Linvilla" "Russia"
## [13] "Spain" "State College" "Tuolumne" "Turkey"
## [17] "Ukraine"
levels(mat2$country)[c(9, 10)] <- "NY-MA"
levels(mat2$country)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "NY-MA" "Linvilla" "Russia" "Spain"
## [13] "State College" "Tuolumne" "Turkey" "Ukraine"
levels(mat2$country)[c(10, 13)] <- "PA"
levels(mat2$country)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "NY-MA" "PA" "Russia" "Spain"
## [13] "Tuolumne" "Turkey" "Ukraine"
mat3 <- as.data.frame(getGenotypeAlleles(genofile)) ## extracting genotypes from the GDS file
mat3[1:5, 1:5]
## 162333 162334 162335 162336 162337
## AT_Mau_14_01 C/C A/A C/T T/C <NA>
## AT_Mau_14_02 A/C A/A C/T T/T C/C
## AT_Mau_15_50 A/C A/A C/T T/T C/C
## AT_Mau_15_51 A/C <NA> C/T T/C C/C
## AT_See_14_44 A/C A/C C/T T/T C/C
for(i in 1:ncol(mat3)){
mat3[ , i] <- gsub("/", "", mat3[, i])
}
mat3[1:5, 1:5]
## 162333 162334 162335 162336 162337
## AT_Mau_14_01 CC AA CT TC <NA>
## AT_Mau_14_02 AC AA CT TT CC
## AT_Mau_15_50 AC AA CT TT CC
## AT_Mau_15_51 AC <NA> CT TC CC
## AT_See_14_44 AC AC CT TT CC
mat3$ind <- rownames(mat3)
dim(mat3)
## [1] 272 1614
mat3[1:5, 1610:1614]
## 163994 163995 163996 163997 ind
## AT_Mau_14_01 GA GG GC AA AT_Mau_14_01
## AT_Mau_14_02 GA GG GC AT AT_Mau_14_02
## AT_Mau_15_50 GA GG GC AA AT_Mau_15_50
## AT_Mau_15_51 GA GG GC AA AT_Mau_15_51
## AT_See_14_44 GA GA GC AA AT_See_14_44
mat4 <- mat3[mat3$ind %in% mat2$sampleId, ]
dim(mat4)
## [1] 152 1614
mat4[1:5, 1600:1614]
## 163983 163984 163985 163986 163987 163988 163989 163991 163992
## AT_Mau_14_01 AA AG TT CG AG CC AC CC AG
## AT_Mau_14_02 AA AG TT CG AG CC AC CC AA
## AT_Mau_15_50 AA AG TC CG AG CT AC CG AG
## AT_Mau_15_51 AA AG TT CG AG CC AC CG AA
## AT_gr_12_fall AA AG TT CG AG CT AC CG AA
## 163993 163994 163995 163996 163997 ind
## AT_Mau_14_01 TC GA GG GC AA AT_Mau_14_01
## AT_Mau_14_02 TC GA GG GC AT AT_Mau_14_02
## AT_Mau_15_50 TC GA GG GC AA AT_Mau_15_50
## AT_Mau_15_51 TC GA GG GC AA AT_Mau_15_51
## AT_gr_12_fall TC GA GG GC AA AT_gr_12_fall
same_ord <- mat2[ , c(1, 2, 3, 4)]
colnames(same_ord) <- c("ind", "city", "country", "season")
mat5 <- merge(mat4, same_ord, by = "ind")
dim(mat5)
## [1] 152 1617
mat5[1:5, 1:5]
## ind 162333 162334 162335 162336
## 1 AT_gr_12_fall AC <NA> CT TT
## 2 AT_gr_12_spring AC AA CT TT
## 3 AT_Mau_14_01 CC AA CT TC
## 4 AT_Mau_14_02 AC AA CT TT
## 5 AT_Mau_15_50 AC AA CT TT
rownames(mat5) <- mat5[ , 1]
dim(mat5)
## [1] 152 1617
mat5[1:5, 1610:1617]
## 163993 163994 163995 163996 163997 city country
## AT_gr_12_fall TC GA GG GC AA Gross-Enzersdorf Austria
## AT_gr_12_spring TC GA GG GC AA Gross-Enzersdorf Austria
## AT_Mau_14_01 TC GA GG GC AA Mauternbach Austria
## AT_Mau_14_02 TC GA GG GC AT Mauternbach Austria
## AT_Mau_15_50 TC GA GG GC AA Mauternbach Austria
## season
## AT_gr_12_fall fall
## AT_gr_12_spring spring
## AT_Mau_14_01 spring
## AT_Mau_14_02 fall
## AT_Mau_15_50 spring
mat6 <- mat5[ , -c(1, 1615, 1616, 1617)]
dim(mat6)
## [1] 152 1613
mat6[1:5, 1:5]
## 162333 162334 162335 162336 162337
## AT_gr_12_fall AC <NA> CT TT CT
## AT_gr_12_spring AC AA CT TT CC
## AT_Mau_14_01 CC AA CT TC <NA>
## AT_Mau_14_02 AC AA CT TT CC
## AT_Mau_15_50 AC AA CT TT CC
mat6[1:5, 1605:1613]
## 163988 163989 163991 163992 163993 163994 163995 163996 163997
## AT_gr_12_fall CT AC CG AA TC GA GG GC AA
## AT_gr_12_spring CC AA CG AA TC GA GG GC AA
## AT_Mau_14_01 CC AC CC AG TC GA GG GC AA
## AT_Mau_14_02 CC AC CC AA TC GA GG GC AT
## AT_Mau_15_50 CT AC CG AG TC GA GG GC AA
fly_gen <- df2genind(mat6, ploidy = 2, ind.names = rownames(mat6), pop = mat5$country, sep = "")
fly_gen
## /// GENIND OBJECT /////////
##
## // 152 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
##
## // Basic content
## @tab: 152 x 3225 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 1-2)
## @loc.fac: locus factor for the 3225 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: df2genind(X = mat6, sep = "", ind.names = rownames(mat6), pop = mat5$country,
## ploidy = 2)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
fly_gen$tab[1:5, 1:5]
## 162333.A 162333.C 162334.A 162334.C 162335.C
## AT_gr_12_fall 1 1 NA NA 1
## AT_gr_12_spring 1 1 2 0 1
## AT_Mau_14_01 0 2 2 0 1
## AT_Mau_14_02 1 1 2 0 1
## AT_Mau_15_50 1 1 2 0 1
summary(fly_gen$pop)
## Austria MI-WI Charlottesville Denmark Esparto
## 6 9 6 5 5
## Finland France Germany NY-MA PA
## 6 10 12 6 18
## Russia Spain Tuolumne Turkey Ukraine
## 6 7 5 13 38
indmiss_fly <- propTyped(fly_gen, by = "ind")
indmiss_fly[which(indmiss_fly < 0.80)]
## PA_li_10_spring
## 0.7123373
barplot(indmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)
fly_gen <- missingno(fly_gen, type = "geno", cutoff = 0.20) ## This is the genind file that I will use for all the analyses
fly_gen
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
##
## // Basic content
## @tab: 151 x 3225 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 1-2)
## @loc.fac: locus factor for the 3225 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
## quality control begins here
mlg(fly_gen) ## keep only polymorphic loci
## #############################
## # Number of Individuals: 151
## # Number of MLG: 151
## #############################
## [1] 151
isPoly(fly_gen) %>% summary
## Mode FALSE TRUE
## logical 1 1612
poly_loci <- names(which(isPoly(fly_gen) == TRUE))
fly_gen2 <- fly_gen[loc = poly_loci]
isPoly(fly_gen2) %>% summary
## Mode TRUE
## logical 1612
fly_gen2$loc.n.all <- fly_gen2$loc.n.all[which(fly_gen2$loc.n.all == 2)]
n <- names(which(nAll(fly_gen2) == 2))
fly_gen2 <- fly_gen2[loc = n]
fly_gen2
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,612 loci; 3,224 alleles; size: 2.7 Mb
##
## // Basic content
## @tab: 151 x 3224 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3224 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
indmiss_fly2 <- propTyped(fly_gen2, by = "ind")
barplot(indmiss_fly2, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)
locmiss_fly <- propTyped(fly_gen2, by = "loc")
locmiss_fly[which(locmiss_fly < 0.80)] ## print loci with < 80% complete genotypes
## 162334 162396 162412 162545 162546 162547 162549 162551
## 0.6225166 0.6092715 0.7880795 0.7947020 0.7682119 0.7947020 0.7748344 0.7880795
## 162553 162554 162555 162557 162560 162563 162587 162623
## 0.7880795 0.7814570 0.7880795 0.7947020 0.7814570 0.7947020 0.7748344 0.5298013
## 162746 162755 162756 162882 162892 162986 162987 162988
## 0.4238411 0.5562914 0.5562914 0.7748344 0.4238411 0.7152318 0.7152318 0.7284768
## 162989 162990 162992 163029 163072 163073 163074 163075
## 0.7284768 0.7284768 0.7549669 0.6291391 0.4834437 0.4370861 0.4834437 0.4834437
## 163076 163118 163119 163120 163121 163122 163123 163124
## 0.4834437 0.6688742 0.6688742 0.6688742 0.6688742 0.6688742 0.6158940 0.6158940
## 163125 163307 163363 163365 163366 163608 163648 163650
## 0.4370861 0.4768212 0.7947020 0.7947020 0.7483444 0.7350993 0.5165563 0.5827815
## 163692 163715 163716 163717 163765 163821 163908 163910
## 0.4569536 0.7086093 0.7086093 0.7086093 0.6357616 0.6092715 0.5231788 0.5629139
## 163914 163915 163916 163917 163922 163958
## 0.6158940 0.5894040 0.5827815 0.5761589 0.6953642 0.7019868
barplot(locmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)
fly_gen3 <- missingno(fly_gen2, type = "loci", cutoff = 0.20)
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
locmiss_fly3 <- propTyped(fly_gen3, by = "loc")
locmiss_fly3[which(locmiss_fly3 < 0.80)]
## named numeric(0)
fly_gen3$tab[1:5, 1:5]
## 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall 1 1 1 1 2
## AT_gr_12_spring 1 1 1 1 2
## AT_Mau_14_01 0 2 1 1 1
## AT_Mau_14_02 1 1 1 1 2
## AT_Mau_15_50 1 1 1 1 2
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
summary(fly_gen3$pop)
## Austria MI-WI Charlottesville Denmark Esparto
## 6 9 6 5 5
## Finland France Germany NY-MA PA
## 6 10 12 6 17
## Russia Spain Tuolumne Turkey Ukraine
## 6 7 5 13 38
mlg(fly_gen3)
## #############################
## # Number of Individuals: 151
## # Number of MLG: 151
## #############################
## [1] 151
isPoly(fly_gen3) %>% summary
## Mode TRUE
## logical 1550
poly_loci3 <- names(which(isPoly(fly_gen3) == TRUE))
fly_gen3 <- fly_gen3[loc = poly_loci3]
isPoly(fly_gen3) %>% summary
## Mode TRUE
## logical 1550
barplot(locmiss_fly3, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)
## computing basic stats
basic_fly <- basic.stats(fly_gen3, diploid = TRUE)
## testing for H-W equilibrium
## Chi-squared test: p-value
HWE.test <- data.frame(sapply(seppop(fly_gen3),
function(ls) pegas::hw.test(ls, B = 0)[ , 3]))
HWE.test.chisq <- t(data.matrix(HWE.test))
HWE.test.chisq[1:5, 1:5]
## 162333 162335 162336 162337 162338
## Austria 0.08018122 0.01430588 0.6242061 0.8037847 1.0000000
## MI.WI 0.08968602 0.04722090 1.0000000 1.0000000 0.5485062
## Charlottesville 0.23013934 0.02534732 1.0000000 0.8237839 0.8037847
## Denmark 0.33790402 0.02534732 1.0000000 0.5761501 1.0000000
## Esparto 0.23013934 0.04550026 1.0000000 0.7750970 0.8037847
## Monte Carlo: p-value
HWE.test <- data.frame(sapply(seppop(fly_gen3),
function(ls) pegas::hw.test(ls, B = 1000)[ , 4]))
HWE.test.MC <- t(data.matrix(HWE.test))
HWE.test.MC[1:5, 1:5]
## 162333 162335 162336 162337 162338
## Austria 0.359 0.083 1 1 1
## MI.WI 0.411 0.165 1 1 1
## Charlottesville 1.000 0.115 1 1 1
## Denmark 1.000 0.123 1 1 1
## Esparto 1.000 0.299 1 1 1
alpha <- 0.05
Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq < alpha, 2, mean), MC = apply(HWE.test.MC < alpha, 2, mean))
Prop.loci.out.of.HWE[1:10, ]
## Chisq MC
## 162333 0.3333333 0.2666667
## 162335 1.0000000 0.4000000
## 162336 0.0000000 0.0000000
## 162337 0.0000000 0.0000000
## 162338 0.0000000 0.0000000
## 162339 0.0000000 0.0000000
## 162340 0.0000000 0.0000000
## 162341 1.0000000 0.4666667
## 162342 0.0000000 0.0000000
## 162343 0.0000000 0.0000000
## "False discovery rate" correction for the number of tests. Here I use the function ‘p.adjust’ with the argument method = "fdr" to adjust the p-values from the previous tests
Chisq.fdr <- matrix(p.adjust(HWE.test.chisq, method = "fdr"),
nrow = nrow(HWE.test.chisq))
MC.fdr <- matrix(p.adjust(HWE.test.MC, method = "fdr"),
nrow = nrow(HWE.test.MC))
Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq<alpha, 2, mean),
MC = apply(HWE.test.MC<alpha, 2, mean),
Chisq.fdr = apply(Chisq.fdr<alpha, 2, mean),
MC.fdr = apply(MC.fdr<alpha, 2, mean))
head(Prop.loci.out.of.HWE)
## Chisq MC Chisq.fdr MC.fdr
## 162333 0.3333333 0.2666667 0.2666667 0.1333333
## 162335 1.0000000 0.4000000 0.6000000 0.2666667
## 162336 0.0000000 0.0000000 0.0000000 0.0000000
## 162337 0.0000000 0.0000000 0.0000000 0.0000000
## 162338 0.0000000 0.0000000 0.0000000 0.0000000
## 162339 0.0000000 0.0000000 0.0000000 0.0000000
loci <- data.frame()
idx <- 1
for(i in 1:nrow(Prop.loci.out.of.HWE)){
fdr <- Prop.loci.out.of.HWE[i , ]
if(fdr$MC.fdr > 0.5){
loci <- rbind(fdr, loci)
}
}
loci ## none of the loci are consistenly out of HWE. Some of them are out of HWE in less than 50% of the populations.
## data frame with 0 columns and 0 rows
dim(loci)
## [1] 0 0
## check for linkage disequilibrium (LD)
LD <- as.genclone(fly_gen3)
LD
##
## This is a genclone object
## -------------------------
## Genotype information:
##
## 151 original multilocus genotypes
## 151 diploid individuals
## 1550 codominant loci
##
## Population information:
##
## 0 strata.
## 15 populations defined -
## Austria, MI-WI, Charlottesville, ..., Tuolumne, Turkey, Ukraine
quartz()
LD.general <- poppr::ia(LD, sample = 500)
LD.general
## Ia p.Ia rbarD p.rD
## 15.592940300 0.001996008 0.013771123 0.001996008
Computing pairwise Fst test
fly_fst <- genet.dist(fly_gen3, method = "WC84") %>% round(digits = 3)
fst.mat <- as.matrix(fly_fst)
fst.mat[fst.mat < 0] <- 0
fst.mat[1:10, 1:10]
## Austria MI-WI Charlottesville Denmark Esparto Finland France
## Austria 0.000 0.053 0.058 0.018 0.056 0.015 0.021
## MI-WI 0.053 0.000 0.005 0.049 0.012 0.049 0.042
## Charlottesville 0.058 0.005 0.000 0.054 0.017 0.057 0.045
## Denmark 0.018 0.049 0.054 0.000 0.049 0.024 0.018
## Esparto 0.056 0.012 0.017 0.049 0.000 0.054 0.041
## Finland 0.015 0.049 0.057 0.024 0.054 0.000 0.022
## France 0.021 0.042 0.045 0.018 0.041 0.022 0.000
## Germany 0.012 0.041 0.046 0.016 0.043 0.013 0.010
## NY-MA 0.044 0.006 0.003 0.040 0.009 0.044 0.033
## PA 0.053 0.004 0.004 0.050 0.013 0.051 0.044
## Germany NY-MA PA
## Austria 0.012 0.044 0.053
## MI-WI 0.041 0.006 0.004
## Charlottesville 0.046 0.003 0.004
## Denmark 0.016 0.040 0.050
## Esparto 0.043 0.009 0.013
## Finland 0.013 0.044 0.051
## France 0.010 0.033 0.044
## Germany 0.000 0.034 0.041
## NY-MA 0.034 0.000 0.002
## PA 0.041 0.002 0.000
tab <- gi2gl(fly_gen3, parallel = FALSE, verbose = NULL) ## converting genind object to genlight object
## Starting gi2gl
## Starting gl.compliance.check
## Processing genlight object with SNP data
## Checking coding of SNPs
## SNP data scored NA, 0, 1 or 2 confirmed
## Checking locus metrics and flags
## Recalculating locus metrics
## Checking for monomorphic loci
## No monomorphic loci detected
## Checking for loci with all missing data
## No loci with all missing data detected
## Checking whether individual names are unique.
## Checking for individual metrics
## Warning: Creating a slot for individual metrics
## Checking for population assignments
## Population assignments confirmed
## Spelling of coordinates checked and changed if necessary to
## lat/lon
## Completed: gl.compliance.check
## Completed: gi2gl
pval_fst <- gl.fst.pop(tab, nboots = 1000, percent = 95, nclusters = 1, verbose = NULL)
## Starting gl.fst.pop
## Processing genlight object with SNP data
## Completed: gl.fst.pop
fst <- as.matrix(as.dist(pval_fst$Fsts))
fst[fst < 0] <- 0
fst
## Austria Esparto Tuolumne Germany Denmark
## Austria 0.00000000 0.055703474 0.058895752 0.012410932 0.01800624
## Esparto 0.05570347 0.000000000 0.000000000 0.043366976 0.04899978
## Tuolumne 0.05889575 0.000000000 0.000000000 0.046380979 0.05326210
## Germany 0.01241093 0.043366976 0.046380979 0.000000000 0.01632085
## Denmark 0.01800624 0.048999783 0.053262097 0.016320855 0.00000000
## Spain 0.01644031 0.044356991 0.047779471 0.007609145 0.02535346
## Finland 0.01510726 0.054481851 0.060073773 0.013480562 0.02446933
## France 0.02086204 0.040896003 0.045503573 0.009973570 0.01796563
## NY-MA 0.04387058 0.008737692 0.007955468 0.033551239 0.04003517
## MI-WI 0.05336238 0.012416407 0.008962355 0.041320541 0.04880945
## PA 0.05310220 0.013471651 0.009573028 0.041313299 0.05015960
## Russia 0.01689966 0.053516381 0.060654501 0.020656074 0.02332836
## Turkey 0.01412191 0.060541318 0.062921149 0.018900400 0.02179383
## Ukraine 0.01416309 0.063774691 0.069481577 0.020327235 0.02369749
## Charlottesville 0.05819820 0.017272103 0.015222898 0.046035247 0.05402774
## Spain Finland France NY-MA MI-WI
## Austria 0.016440309 0.01510726 0.02086204 0.043870575 0.053362379
## Esparto 0.044356991 0.05448185 0.04089600 0.008737692 0.012416407
## Tuolumne 0.047779471 0.06007377 0.04550357 0.007955468 0.008962355
## Germany 0.007609145 0.01348056 0.00997357 0.033551239 0.041320541
## Denmark 0.025353461 0.02446933 0.01796563 0.040035167 0.048809450
## Spain 0.000000000 0.02001428 0.01567832 0.029541647 0.039313950
## Finland 0.020014275 0.00000000 0.02212788 0.043508846 0.048678641
## France 0.015678321 0.02212788 0.00000000 0.033457193 0.042244680
## NY-MA 0.029541647 0.04350885 0.03345719 0.000000000 0.005774646
## MI-WI 0.039313950 0.04867864 0.04224468 0.005774646 0.000000000
## PA 0.038958914 0.05145801 0.04367546 0.001936861 0.004351338
## Russia 0.031639501 0.01913517 0.02199934 0.047419819 0.052957069
## Turkey 0.026809159 0.01677821 0.02823027 0.049358359 0.054304445
## Ukraine 0.031831900 0.02245250 0.02881299 0.055798661 0.064183546
## Charlottesville 0.046471038 0.05728551 0.04450952 0.002951612 0.004506388
## PA Russia Turkey Ukraine Charlottesville
## Austria 0.053102204 0.016899657 0.01412191 0.014163086 0.058198201
## Esparto 0.013471651 0.053516381 0.06054132 0.063774691 0.017272103
## Tuolumne 0.009573028 0.060654501 0.06292115 0.069481577 0.015222898
## Germany 0.041313299 0.020656074 0.01890040 0.020327235 0.046035247
## Denmark 0.050159598 0.023328359 0.02179383 0.023697489 0.054027742
## Spain 0.038958914 0.031639501 0.02680916 0.031831900 0.046471038
## Finland 0.051458012 0.019135166 0.01677821 0.022452499 0.057285510
## France 0.043675456 0.021999335 0.02823027 0.028812985 0.044509519
## NY-MA 0.001936861 0.047419819 0.04935836 0.055798661 0.002951612
## MI-WI 0.004351338 0.052957069 0.05430445 0.064183546 0.004506388
## PA 0.000000000 0.058255707 0.05610334 0.066041832 0.004019586
## Russia 0.058255707 0.000000000 0.01717863 0.007515034 0.059645911
## Turkey 0.056103340 0.017178630 0.00000000 0.012939846 0.058036081
## Ukraine 0.066041832 0.007515034 0.01293985 0.000000000 0.067300473
## Charlottesville 0.004019586 0.059645911 0.05803608 0.067300473 0.000000000
## png("heatmap.png", width = 7, height = 7, units = "in", res = 300)
## pdf("heatmap.pdf")
my_palette <- colorRampPalette(c("blue", "red"))(n = 100)
heatmap.2(fst.mat, density.info = "none", trace = "none", scale = "none", cexRow = 0.7, cexCol = 0.7, key.title = "Fst", srtCol = 45, srtRow = 35, margins = c(8.5, 5), col = my_palette, symbreaks = TRUE)
## dev.off()
Figure 1. Heatmap depicting genetic differentiation among populations of D. melanogaster based on pairwise Fst values. In some cases, samples from adjacent localities were combined to avoid low sample sizes. Abbreviations are as follows: PA = Pennsylvania; MI-WI = Michigan and Wisconsin; NY-MA = New York and Massachusetts.
Genetic admixture proportions
##seqSetFilter(genofile, variant.id = rownames(snps), sample.id = rownames(indi))
##seqGDS2VCF(genofile, "my_vcf.gz", verbose = TRUE)
##outest <- read.vcfR("my_vcf.gz")
##dataout <- vcfR2genind(outest)
## exploring the data once again (do it all the time!)
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
fly_gen3$tab[1:5, 1:5]
## 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall 1 1 1 1 2
## AT_gr_12_spring 1 1 1 1 2
## AT_Mau_14_01 0 2 1 1 1
## AT_Mau_14_02 1 1 1 1 2
## AT_Mau_15_50 1 1 1 1 2
nLoc(fly_gen3) # number of loci
## [1] 1550
nPop(fly_gen3) # number of sites
## [1] 15
nInd(fly_gen3) # number of individuals
## [1] 151
summary(fly_gen3$pop) # sample size
## Austria MI-WI Charlottesville Denmark Esparto
## 6 9 6 5 5
## Finland France Germany NY-MA PA
## 6 10 12 6 17
## Russia Spain Tuolumne Turkey Ukraine
## 6 7 5 13 38
min(ploidy(fly_gen3))
## [1] 2
max(ploidy(fly_gen3))
## [1] 2
## gl2geno(tab, outfile = "gl_geno", outpath = getwd(), verbose = NULL) ## Converting genlight object to geno object
## Run snmf algorithm
#snmf1 <- snmf("gl_geno.geno",
# K = 1:10, # number of K ancestral populations to run
# repetitions = 50, # 50 repetitions for each K
# entropy = TRUE, # calculate cross-entropy
# project = "new")
snmf1 <- load.snmfProject("gl_geno.snmfProject")
## extract Q-matrix for the best run
plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1)
## extract the cross-entropy of all runs where K = 9
ce <- cross.entropy(snmf1, K = 9)
ce
## K = 9
## run 1 0.3026330
## run 2 0.3006478
## run 3 0.2965800
## run 4 0.2934025
## run 5 0.2985019
## run 6 0.3038042
## run 7 0.3018757
## run 8 0.2907803
## run 9 0.2968350
## run 10 0.2961763
## run 11 0.2993841
## run 12 0.2929597
## run 13 0.2910719
## run 14 0.2818298
## run 15 0.2994594
## run 16 0.2894918
## run 17 0.2949548
## run 18 0.2914579
## run 19 0.2884607
## run 20 0.3000826
## run 21 0.2908541
## run 22 0.2981863
## run 23 0.2902612
## run 24 0.2904714
## run 25 0.3029123
## run 26 0.3011128
## run 27 0.2925949
## run 28 0.2941099
## run 29 0.3023984
## run 30 0.3020126
## run 31 0.2983558
## run 32 0.2997775
## run 33 0.2902648
## run 34 0.2993205
## run 35 0.2987137
## run 36 0.2952065
## run 37 0.2994274
## run 38 0.3042761
## run 39 0.2976461
## run 40 0.2993694
## run 41 0.2967565
## run 42 0.2944768
## run 43 0.2941358
## run 44 0.2882492
## run 45 0.2983567
## run 46 0.2942568
## run 47 0.2906855
## run 48 0.2999784
## run 49 0.3022837
## run 50 0.2979496
## find the run with the lowest cross-entropy
lowest.ce <- which.min(ce)
lowest.ce
## [1] 14
## extract Q-matrix for the best run
qmatrix <- as.data.frame(Q(snmf1, K = 9, run = lowest.ce))
head(qmatrix)
## V1 V2 V3 V4 V5 V6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320 0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000 0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973 0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600 0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964 0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400 0.319622
## V7 V8 V9
## 1 9.99639e-05 0.049494200 0.079542400
## 2 1.66984e-02 0.000099973 0.299490000
## 3 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.425819000 0.000099964
## changing order of levels
pops <- fly_gen3$pop
levels(pops)
## [1] "Austria" "MI-WI" "Charlottesville" "Denmark"
## [5] "Esparto" "Finland" "France" "Germany"
## [9] "NY-MA" "PA" "Russia" "Spain"
## [13] "Tuolumne" "Turkey" "Ukraine"
levels(pops)[3] <- "Charlott."
qmplot <- cbind(qmatrix, pops)
qmplot$pops <- factor(qmplot$pops, levels = c("Esparto", "Tuolumne", "MI-WI", "Charlott.", "PA", "NY-MA", "Spain", "France", "Germany", "Austria", "Denmark", "Finland", "Ukraine", "Russia", "Turkey"))
qmplot <- qmplot[order(qmplot$pops), ]
pops <- qmplot$pops
## png("admixture_revision3.png", width = 7, height = 7, units = "in", res = 300)
## pdf("admixture_revision3.pdf")
layout(matrix(c(0, 1, 1, 0,
0, 1, 1, 0,
2, 2, 2, 2,
2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))
par(mar = c(4, 4, 1, 0))
plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1, cex.lab = 1.3)
Arrows(x = 9, y = 0.295, x1 = 9, y1 = 0.305, col = "red", arr.type = "triangle", code = 1, lwd = 1.5, arr.length = 0.2)
mtext("(a)", side = 2, at = 0.342, line = 2.8, font = 2, family = "serif", las = 1)
par(mar = c(5, 4.5, 1, 0))
barplot(t(qmplot[1:9]), col = RColorBrewer::brewer.pal(9,"Paired"), border = NA, space = 0, xlab = "", xaxt = "n", ylab = "Admixture proportion", las = 1, cex.lab = 1.4)
ticks <- c(0, 6, 11, 20, 26, 43, 49, 56, 66, 78, 84, 89, 95, 133, 139)
## adding population labels to the axis:
names <- unique(as.character(pops))
#medians <- c()
for(i in ticks){
#for(i in 1:length(pops)){
axis(1, at = i, labels = "")
#axis(1, at = median(which(pops == pops[i])), labels = "")
#medians <- c(medians, median(which(pops == pops[i])))
}
#medians[1:10]
text(x = c(3, 8, 15, 25, 29, 47, 53, 60, 73, 82, 87, 94, 101, 136, 143), y = par("usr")[3] - 0.05, labels = names, xpd = NA, srt = 35, cex = 1, adj = 1)
#text(x = as.numeric(unique(as.character(medians))), y = par("usr")[3] - 0.05, labels = names, xpd = NA, srt = 35, cex = 1, adj = 1.2)
mtext("Pools", side = 1, line = 4)
mtext("(b)", side = 2, at = 1.1, line = 2.4, font = 2, family = "serif", las = 1)
## dev.off()
Figure 2. Population structure analysis based on individual ancestry coefficients for a number of ancestral populations. (a) Cross-entropy values for each snmf run with k ranging between k = 1 and k = 10. The red arrow indicates the most likely value of k. (b) Admixture proportion across populations of D. melanogaster. Tick marks on the x axis correspond to the median number of pools per locality. Colors indicate genetic clusters.
## label column names of qmatrix
ncol(qmatrix)
## [1] 9
cluster_names = c()
for(i in 1:ncol(qmatrix)){
cluster_names[i] = paste("Cluster", i)
}
cluster_names
## [1] "Cluster 1" "Cluster 2" "Cluster 3" "Cluster 4" "Cluster 5" "Cluster 6"
## [7] "Cluster 7" "Cluster 8" "Cluster 9"
colnames(qmatrix) <- cluster_names
head(qmatrix)
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320 0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000 0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973 0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600 0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964 0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400 0.319622
## Cluster 7 Cluster 8 Cluster 9
## 1 9.99639e-05 0.049494200 0.079542400
## 2 1.66984e-02 0.000099973 0.299490000
## 3 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.425819000 0.000099964
dim(qmatrix)
## [1] 151 9
## add individual IDs
qmatrix$Ind <- indNames(fly_gen3)
## add site IDs
qmatrix$Site <- fly_gen3$pop
head(qmatrix)
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320 0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000 0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973 0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600 0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964 0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400 0.319622
## Cluster 7 Cluster 8 Cluster 9 Ind Site
## 1 9.99639e-05 0.049494200 0.079542400 AT_gr_12_fall Austria
## 2 1.66984e-02 0.000099973 0.299490000 AT_gr_12_spring Austria
## 3 9.99730e-05 0.188641000 0.133300000 AT_Mau_14_01 Austria
## 4 9.99730e-05 0.516370000 0.147398000 AT_Mau_14_02 Austria
## 5 9.99640e-05 0.167587000 0.000099964 AT_Mau_15_50 Austria
## 6 9.99640e-05 0.425819000 0.000099964 AT_Mau_15_51 Austria
## calculate mean admixture proportions for each site
clusters <- grep("Cluster", names(qmatrix)) ## indexes of cluster columns
avg_admix <- aggregate(qmatrix[, clusters], list(qmatrix$Site, qmatrix$Ind), mean)
colnames(avg_admix)[1:2] <- c("country", "Ind")
head(avg_admix)
## country Ind Cluster 1 Cluster 2 Cluster 3 Cluster 4
## 1 Austria AT_gr_12_fall 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05
## 2 Austria AT_gr_12_spring 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01
## 3 Austria AT_Mau_14_01 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02
## 4 Austria AT_Mau_14_02 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05
## 5 Austria AT_Mau_15_50 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05
## 6 Austria AT_Mau_15_51 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05
## Cluster 5 Cluster 6 Cluster 7 Cluster 8 Cluster 9
## 1 0.000755320 0.613557 9.99639e-05 0.049494200 0.079542400
## 2 0.178439000 0.299589 1.66984e-02 0.000099973 0.299490000
## 3 0.000099973 0.328833 9.99730e-05 0.188641000 0.133300000
## 4 0.018455600 0.145086 9.99730e-05 0.516370000 0.147398000
## 5 0.000099964 0.184407 9.99640e-05 0.167587000 0.000099964
## 6 0.036707400 0.319622 9.99640e-05 0.425819000 0.000099964
str(avg_admix)
## 'data.frame': 151 obs. of 11 variables:
## $ country : Factor w/ 15 levels "Austria","MI-WI",..: 1 1 1 1 1 1 5 5 5 5 ...
## $ Ind : chr "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
## $ Cluster 1: num 0.0001 0.0001 0.0001 0.0001 0.0133 ...
## $ Cluster 2: num 0.0001 0.0001 0.0779 0.164 0.0437 ...
## $ Cluster 3: num 0.25625 0.06443 0.24535 0.00835 0.5906 ...
## $ Cluster 4: num 0.0001 0.1411 0.0257 0.0001 0.0001 ...
## $ Cluster 5: num 0.000755 0.178439 0.0001 0.018456 0.0001 ...
## $ Cluster 6: num 0.614 0.3 0.329 0.145 0.184 ...
## $ Cluster 7: num 0.0001 0.0167 0.0001 0.0001 0.0001 ...
## $ Cluster 8: num 0.0495 0.0001 0.1886 0.5164 0.1676 ...
## $ Cluster 9: num 0.0795 0.2995 0.1333 0.1474 0.0001 ...
## import csv file containing coordinates
coor <- read.csv("coor.csv")
str(coor)
## 'data.frame': 16 obs. of 3 variables:
## $ country: chr "Austria" "Esparto" "Tuolumne" "Germany" ...
## $ long : num 15.96 -122.05 -120.26 9.71 10.21 ...
## $ lat : num 48.3 38.7 38 48.2 55.9 ...
head(coor)
## country long lat
## 1 Austria 15.965000 48.28750
## 2 Esparto -122.046000 38.68000
## 3 Tuolumne -120.260000 37.97000
## 4 Germany 9.714757 48.19901
## 5 Denmark 10.212586 55.94513
## 6 Spain 1.279236 41.51821
admix <- merge(coor, avg_admix, by = "country")
str(admix)
## 'data.frame': 151 obs. of 13 variables:
## $ country : chr "Austria" "Austria" "Austria" "Austria" ...
## $ long : num 16 16 16 16 16 ...
## $ lat : num 48.3 48.3 48.3 48.3 48.3 ...
## $ Ind : chr "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
## $ Cluster 1: num 0.0001 0.0001 0.0001 0.0001 0.0133 ...
## $ Cluster 2: num 0.0001 0.0001 0.0779 0.164 0.0437 ...
## $ Cluster 3: num 0.25625 0.06443 0.24535 0.00835 0.5906 ...
## $ Cluster 4: num 0.0001 0.1411 0.0257 0.0001 0.0001 ...
## $ Cluster 5: num 0.000755 0.178439 0.0001 0.018456 0.0001 ...
## $ Cluster 6: num 0.614 0.3 0.329 0.145 0.184 ...
## $ Cluster 7: num 0.0001 0.0167 0.0001 0.0001 0.0001 ...
## $ Cluster 8: num 0.0495 0.0001 0.1886 0.5164 0.1676 ...
## $ Cluster 9: num 0.0795 0.2995 0.1333 0.1474 0.0001 ...
head(admix)
## country long lat Ind Cluster 1 Cluster 2 Cluster 3
## 1 Austria 15.965 48.2875 AT_gr_12_fall 9.99639e-05 9.99639e-05 0.25625200
## 2 Austria 15.965 48.2875 AT_gr_12_spring 9.99730e-05 9.99730e-05 0.06442830
## 3 Austria 15.965 48.2875 AT_Mau_14_01 9.99730e-05 7.78667e-02 0.24535100
## 4 Austria 15.965 48.2875 AT_Mau_14_02 9.99730e-05 1.64043e-01 0.00834852
## 5 Austria 15.965 48.2875 AT_Mau_15_50 1.32607e-02 4.37458e-02 0.59060000
## 6 Austria 15.965 48.2875 AT_Mau_15_51 9.99640e-05 1.04134e-01 0.11331700
## Cluster 4 Cluster 5 Cluster 6 Cluster 7 Cluster 8 Cluster 9
## 1 9.99639e-05 0.000755320 0.613557 9.99639e-05 0.049494200 0.079542400
## 2 1.41056e-01 0.178439000 0.299589 1.66984e-02 0.000099973 0.299490000
## 3 2.57084e-02 0.000099973 0.328833 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.018455600 0.145086 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.000099964 0.184407 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.036707400 0.319622 9.99640e-05 0.425819000 0.000099964
colnames(admix)[c(5, 6, 7, 8, 9, 10, 11, 12, 13)] <- c("cluster1", "cluster2", "cluster3", "cluster4", "cluster5", "cluster6", "cluster7", "cluster8", "cluster9")
pies <- admix[ , c(1:3, 5:13)]
meanpies <- aggregate(pies[2:12], by = list(pies$country), mean)
Plotting admixture proportions on a map
## Plotting map
## png("map_revision.png", width = 7, height = 7, units = "in", res = 300)
## pdf("map_revision.pdf")
quartz()
layout(matrix(c(1, 1, 1, 1,
1, 1, 1, 1,
2, 2, 2, 2,
2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))
map("state", fill = TRUE, col = "beige", border = "gray", asp = 1)
map.axes(cex.axis = 1, las = 1)
mtext(side = 1, line = 3, "Longitude")
mtext(side = 2, line = 3, "Latitude")
addCompass(-120, 29, rot = 0, useWest = TRUE, cex = 0.5, col.compass = "gainsboro")
## Adding pie charts to the map
for(i in 1:nrow(meanpies)){
add.pie(z = c(meanpies$cluster1[i], meanpies$cluster2[i], meanpies$cluster3[i], meanpies$cluster4[i], meanpies$cluster5[i], meanpies$cluster6[i], meanpies$cluster7[i], meanpies$cluster8[i], meanpies$cluster9[i]), x = meanpies$long[i], y = meanpies$lat[i], radius = 1.5, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = FALSE)
}
newmap <- getMap(resolution = "low")
plot(newmap, col = "beige", border = "gray", xlim = c(-10, 40), ylim = c(35, 71), asp = 1)
map.axes(cex.axis = 1, las = 1)
mtext(side = 1, line = 3, "Longitude")
mtext(side = 2, line = 3, "Latitude")
addCompass(-16, 40, rot = 0, useWest = TRUE, cex = 0.5, col.compass = "gainsboro")
## Adding pie charts to the map
for(i in 1:nrow(meanpies)){
add.pie(z = c(meanpies$cluster1[i], meanpies$cluster2[i], meanpies$cluster3[i], meanpies$cluster4[i], meanpies$cluster5[i], meanpies$cluster6[i], meanpies$cluster7[i], meanpies$cluster8[i], meanpies$cluster9[i]), x = meanpies$long[i], y = meanpies$lat[i], radius = 2.5, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = FALSE)
}
## dev.off()
Figure 3. Mean admixture proportions across populations of D. melanogaster surveyed in America (upper panel) and Europe (lower panel). Colors in the pie charts represent genetic clusters.
Isolation by Environment and Isolation by Distance analyses
unique(fly_gen3$pop)
## [1] Austria Esparto Tuolumne Germany
## [5] Denmark Spain Finland France
## [9] NY-MA MI-WI PA Russia
## [13] Turkey Ukraine Charlottesville
## 15 Levels: Austria MI-WI Charlottesville Denmark Esparto Finland ... Ukraine
obj <- matrix(NA, nrow = length(unique(fly_gen3$pop)), ncol = 3, byrow = TRUE)
idx <- 1
for(i in unique(fly_gen3$pop)){
long <- admix$long[admix$country == i][1]
lat <- admix$lat[admix$country == i][1]
country <- i
obj[idx, ] <- c(country, long, lat)
idx = idx + 1
}
obj
## [,1] [,2] [,3]
## [1,] "Austria" "15.965" "48.2875"
## [2,] "Esparto" "-122.046" "38.68"
## [3,] "Tuolumne" "-120.26" "37.97"
## [4,] "Germany" "9.7147565" "48.199012"
## [5,] "Denmark" "10.212586" "55.945128"
## [6,] "Spain" "1.279236" "41.518208"
## [7,] "Finland" "23.52" "61.1"
## [8,] "France" "3.5442054" "46.8656662"
## [9,] "NY-MA" "-74.085" "42.445"
## [10,] "MI-WI" "-88.04915" "42.61055"
## [11,] "PA" "-76.6350005" "40.3366975"
## [12,] "Russia" "33.235277" "58.02444"
## [13,] "Turkey" "32.260328" "40.231444"
## [14,] "Ukraine" "30.16599445" "49.47387778"
## [15,] "Charlottesville" "-78.47" "38"
obj <- as.data.frame(obj)
colnames(obj) <- c("country", "x", "y")
obj$x <- as.numeric(obj$x)
obj$y <- as.numeric(obj$y)
rownames(obj) <- obj[ , 1]
obj <- obj[ , c(2, 3)]
str(obj)
## 'data.frame': 15 obs. of 2 variables:
## $ x: num 15.96 -122.05 -120.26 9.71 10.21 ...
## $ y: num 48.3 38.7 38 48.2 55.9 ...
fly_gen3@other <- obj[ , c(1, 2)]
fly_gen3
## /// GENIND OBJECT /////////
##
## // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
##
## // Basic content
## @tab: 151 x 3100 matrix of allele counts
## @loc.n.all: number of alleles per locus (range: 2-2)
## @loc.fac: locus factor for the 3100 columns of @tab
## @all.names: list of allele names for each locus
## @ploidy: ploidy of each individual (range: 2-2)
## @type: codom
## @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
##
## // Optional content
## @pop: population of each individual (group size range: 5-38)
## @other: a list containing: x y
## genetic distance
GD.pop.PairwiseFst.hierfstat <- as.dist(hierfstat::pairwise.neifst(hierfstat::genind2hierfstat(fly_gen3)))
GD <- as.matrix(GD.pop.PairwiseFst.hierfstat)
USA <- c("MI-WI", "Charlottesville", "Esparto", "NY-MA", "PA", "Tuolumne")
GD <- as.dist(GD[!rownames(GD) %in% USA, !colnames(GD) %in% USA])
GD
## Austria Denmark Finland France Germany Russia Spain Turkey
## Denmark 0.0181
## Finland 0.0151 0.0246
## France 0.0212 0.0186 0.0222
## Germany 0.0128 0.0170 0.0136 0.0100
## Russia 0.0169 0.0234 0.0192 0.0222 0.0210
## Spain 0.0165 0.0256 0.0200 0.0158 0.0077 0.0318
## Turkey 0.0139 0.0217 0.0163 0.0280 0.0188 0.0167 0.0264
## Ukraine 0.0137 0.0234 0.0214 0.0281 0.0196 0.0067 0.0308 0.0129
## geographic distance
Dgeo <- round(dist(fly_gen3$other), 3)
Dgeo <- as.matrix(Dgeo)
Dgeo <- as.dist(Dgeo[!rownames(Dgeo) %in% USA, !colnames(Dgeo) %in% USA])
## making sure the order of rows and columns are the same in all of the matrices
ord.gen <- as.matrix(GD)
ord.geo <- as.matrix(Dgeo)
ord.geo
## Austria Germany Denmark Spain Finland France Russia Turkey Ukraine
## Austria 0.000 6.251 9.578 16.171 14.874 12.502 19.826 18.178 14.250
## Germany 6.251 0.000 7.762 10.761 18.895 6.313 25.490 23.912 20.491
## Denmark 9.578 7.762 0.000 16.969 14.271 11.265 23.116 27.074 20.977
## Spain 16.171 10.761 16.969 0.000 29.633 5.807 35.967 31.008 29.962
## Finland 14.874 18.895 14.271 29.633 0.000 24.529 10.190 22.625 13.392
## France 12.502 6.313 11.265 5.807 24.529 0.000 31.719 29.473 26.749
## Russia 19.826 25.490 23.116 35.967 10.190 31.719 0.000 17.820 9.085
## Turkey 18.178 23.912 27.074 31.008 22.625 29.473 17.820 0.000 9.477
## Ukraine 14.250 20.491 20.977 29.962 13.392 26.749 9.085 9.477 0.000
ord.gen <- ord.gen[rownames(ord.geo), colnames(ord.geo)]
ord.gen
## Austria Germany Denmark Spain Finland France Russia Turkey Ukraine
## Austria 0.0000 0.0128 0.0181 0.0165 0.0151 0.0212 0.0169 0.0139 0.0137
## Germany 0.0128 0.0000 0.0170 0.0077 0.0136 0.0100 0.0210 0.0188 0.0196
## Denmark 0.0181 0.0170 0.0000 0.0256 0.0246 0.0186 0.0234 0.0217 0.0234
## Spain 0.0165 0.0077 0.0256 0.0000 0.0200 0.0158 0.0318 0.0264 0.0308
## Finland 0.0151 0.0136 0.0246 0.0200 0.0000 0.0222 0.0192 0.0163 0.0214
## France 0.0212 0.0100 0.0186 0.0158 0.0222 0.0000 0.0222 0.0280 0.0281
## Russia 0.0169 0.0210 0.0234 0.0318 0.0192 0.0222 0.0000 0.0167 0.0067
## Turkey 0.0139 0.0188 0.0217 0.0264 0.0163 0.0280 0.0167 0.0000 0.0129
## Ukraine 0.0137 0.0196 0.0234 0.0308 0.0214 0.0281 0.0067 0.0129 0.0000
fst.dist <- round(as.dist(ord.gen), 3)
fst.dist
## Austria Germany Denmark Spain Finland France Russia Turkey
## Germany 0.013
## Denmark 0.018 0.017
## Spain 0.016 0.008 0.026
## Finland 0.015 0.014 0.025 0.020
## France 0.021 0.010 0.019 0.016 0.022
## Russia 0.017 0.021 0.023 0.032 0.019 0.022
## Turkey 0.014 0.019 0.022 0.026 0.016 0.028 0.017
## Ukraine 0.014 0.020 0.023 0.031 0.021 0.028 0.007 0.013
Dgeo
## Austria Germany Denmark Spain Finland France Russia Turkey
## Germany 6.251
## Denmark 9.578 7.762
## Spain 16.171 10.761 16.969
## Finland 14.874 18.895 14.271 29.633
## France 12.502 6.313 11.265 5.807 24.529
## Russia 19.826 25.490 23.116 35.967 10.190 31.719
## Turkey 18.178 23.912 27.074 31.008 22.625 29.473 17.820
## Ukraine 14.250 20.491 20.977 29.962 13.392 26.749 9.085 9.477
obj
## x y
## Austria 15.965000 48.28750
## Esparto -122.046000 38.68000
## Tuolumne -120.260000 37.97000
## Germany 9.714757 48.19901
## Denmark 10.212586 55.94513
## Spain 1.279236 41.51821
## Finland 23.520000 61.10000
## France 3.544205 46.86567
## NY-MA -74.085000 42.44500
## MI-WI -88.049150 42.61055
## PA -76.635001 40.33670
## Russia 33.235277 58.02444
## Turkey 32.260328 40.23144
## Ukraine 30.165994 49.47388
## Charlottesville -78.470000 38.00000
dprec <- obj[!rownames(obj) %in% USA, ]
dprec
## x y
## Austria 15.965000 48.28750
## Germany 9.714757 48.19901
## Denmark 10.212586 55.94513
## Spain 1.279236 41.51821
## Finland 23.520000 61.10000
## France 3.544205 46.86567
## Russia 33.235277 58.02444
## Turkey 32.260328 40.23144
## Ukraine 30.165994 49.47388
files <- list.files("/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio", full.names = TRUE)
files
## [1] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_10.tif"
## [2] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_11.tif"
## [3] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_14.tif"
## [4] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_15.tif"
## [5] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_16.tif"
## [6] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_18.tif"
## [7] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_19.tif"
## [8] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_2.tif"
## [9] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_3.tif"
## [10] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_4.tif"
## [11] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_5.tif"
## [12] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_6.tif"
## [13] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_8.tif"
## [14] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_9.tif"
pre_sea <- stack(files[4])
coord <- dprec[ , c(1, 2)]
head(coord)
## x y
## Austria 15.965000 48.28750
## Germany 9.714757 48.19901
## Denmark 10.212586 55.94513
## Spain 1.279236 41.51821
## Finland 23.520000 61.10000
## France 3.544205 46.86567
prec.season <- extract(pre_sea, coord)
colnames(prec.season) <- c("precseason")
rownames(prec.season) <- rownames(coord)
head(prec.season)
## precseason
## Austria 37.24262
## Germany 30.28178
## Denmark 21.50687
## Spain 25.15103
## Finland 33.33337
## France 14.22891
prec <- round(dist(prec.season), 3)
prec
## Austria Germany Denmark Spain Finland France Russia Turkey
## Germany 6.961
## Denmark 15.736 8.775
## Spain 12.092 5.131 3.644
## Finland 3.909 3.052 11.826 8.182
## France 23.014 16.053 7.278 10.922 19.104
## Russia 7.163 0.202 8.572 4.928 3.254 15.850
## Turkey 4.781 11.741 20.516 16.872 8.690 27.794 11.944
## Ukraine 1.902 5.059 13.833 10.189 2.007 21.111 5.261 6.683
## R script for 'MMRR' function that performs Multiple Matrix Regression with Randomization analysis.
source("MMRR.R")
distances <- list(Dgeo = as.matrix(Dgeo), dprec = as.matrix(prec))
MMRR(as.matrix(fst.dist), distances)
## $r.squared
## r.squared
## 0.5393563
##
## $coefficients
## Intercept Dgeo dprec
## 0.0089133784 0.0004727619 0.0001568212
##
## $tstatistic
## Intercept(t) Dgeo(t) dprec(t)
## 4.886716 5.430338 1.433680
##
## $tpvalue
## Intercept(p) Dgeo(p) dprec(p)
## 0.999 0.001 0.213
##
## $Fstatistic
## F-statistic
## 19.31944
##
## $Fpvalue
## F p-value
## 0.001
## partial mantel test
part.mant <- mantel.partial(fst.dist, prec, Dgeo, method = "pearson", permutations = 1000)
part.mant
##
## Partial Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel.partial(xdis = fst.dist, ydis = prec, zdis = Dgeo, method = "pearson", permutations = 1000)
##
## Mantel statistic r: 0.2421
## Significance: 0.086913
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.223 0.294 0.340 0.402
## Permutation: free
## Number of permutations: 1000
part.mant2 <- mantel.partial(fst.dist, Dgeo, prec, method = "pearson", permutations = 1000)
part.mant2
##
## Partial Mantel statistic based on Pearson's product-moment correlation
##
## Call:
## mantel.partial(xdis = fst.dist, ydis = Dgeo, zdis = prec, method = "pearson", permutations = 1000)
##
## Mantel statistic r: 0.687
## Significance: 0.000999
##
## Upper quantiles of permutations (null model):
## 90% 95% 97.5% 99%
## 0.229 0.302 0.365 0.421
## Permutation: free
## Number of permutations: 1000
## mantel test
ibd <- mantel.randtest(fst.dist, Dgeo)
ibd
## Monte-Carlo test
## Call: mantel.randtest(m1 = fst.dist, m2 = Dgeo)
##
## Observation: 0.714608
##
## Based on 999 replicates
## Simulated p-value: 0.001
## Alternative hypothesis: greater
##
## Std.Obs Expectation Variance
## 4.036339334 -0.001721128 0.031495598
quartz()
plot(ibd, las = 1, main = "") ## isolation by distance is clearly significant
mantCor <- mgram(fst.dist, Dgeo, nperm = 1000)
mantCor
## $mgram
## lag ngroup mantelr pval llim ulim
## [1,] 2.513333 0 0.00000000 1.000 0.00000000 0.0000000
## [2,] 7.540000 7 0.48955228 0.006 0.38378455 0.6432192
## [3,] 12.566667 8 0.13723874 0.427 -0.07570513 0.2660222
## [4,] 17.593333 6 0.14671721 0.402 0.01044418 0.3113308
## [5,] 22.620000 6 -0.09568514 0.596 -0.22013509 0.0264787
## [6,] 27.646667 6 -0.44015164 0.005 -0.63975763 -0.2105894
##
## $resids
## [1] NA
##
## attr(,"class")
## [1] "mgram"
mantCor2 <- mgram(fst.dist, prec, nperm = 1000)
mantCor2
## $mgram
## lag ngroup mantelr pval llim ulim
## [1,] 2.299333 7 0.05706438 0.749 -0.08101704 1.885557e-01
## [2,] 6.898000 13 0.30933846 0.058 0.10874731 5.018355e-01
## [3,] 11.496667 6 -0.10844316 0.540 -0.30876377 1.197642e-01
## [4,] 16.095333 5 -0.03780823 0.848 -0.21908220 1.374838e-01
## [5,] 20.694000 3 -0.24514145 0.156 -0.40650049 -1.204746e-02
## [6,] 25.292667 1 -0.05063182 0.801 -0.11499191 5.459326e-17
##
## $resids
## [1] NA
##
## attr(,"class")
## [1] "mgram"
plot(mantCor, las = 1)
plot(mantCor2, las = 1)
## filled dots indicate significant correlations. The Mantel correlogram shows that genotypes are relatively similar at short distance, while this similarity decreases with distance. The negative correlations indicate genetic differenciation between some populations
dens <- MASS::kde2d(as.vector(Dgeo), as.vector(fst.dist), n = 300)
myPal <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
#lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)
dens2 <- MASS::kde2d(as.vector(prec), as.vector(fst.dist), n = 300)
myPal2 <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(prec), as.vector(fst.dist), pch = 20, xlab = "Precipitation distance", ylab = "Genetic Distance", las = 1)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(prec)), lwd = 2)
#lines(loess.smooth(as.vector(prec), as.vector(fst.dist)), col = "red", lwd = 2)
quartz()
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4), nrow = 4, ncol = 4, byrow = TRUE))
plot(lm(as.vector(fst.dist) ~ as.vector(prec)), las = 1)
quartz()
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4), nrow = 4, ncol = 4, byrow = TRUE))
plot(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), las = 1)
## png("distance.revision2.png", width = 7, height = 7, units = "in", res = 300)
## pdf("distance.revision2.pdf")
layout(matrix(c(0, 1, 1, 0,
0, 1, 1, 0,
0, 2, 2, 0,
0, 2, 2, 0), nrow = 4, ncol = 4, byrow = TRUE))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
#lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(a)", side = 2, at = 0.035, line = 2.6, font = 2, family = "serif", las = 1)
plot(as.vector(prec), as.vector(fst.dist), pch = 20, xlab = "Precipitation distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(prec)), lwd = 2)
#lines(loess.smooth(as.vector(prec), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(b)", side = 2, at = 0.035, line = 3.1, font = 2, family = "serif", las = 1)
## dev.off()
Figure 6. Patterns of isolation by distance and isolation by environment according to Multiple Matrix Regression with Randomization analysis (MMRR). (a-b) Geographic and precipitation distances as a function of genetic distance in European populations of D. melanogaster. Colors represent estimated probability densities.
Analysis of seasonlity and adaptation test
## extracting environmental variables from WorldClim and other sources
mat_adap <- admix[ , c(1, 2, 3, 4)]
head(mat_adap)
## country long lat Ind
## 1 Austria 15.965 48.2875 AT_gr_12_fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring
## 3 Austria 15.965 48.2875 AT_Mau_14_01
## 4 Austria 15.965 48.2875 AT_Mau_14_02
## 5 Austria 15.965 48.2875 AT_Mau_15_50
## 6 Austria 15.965 48.2875 AT_Mau_15_51
mat_adap$season <- rep(NA, nrow(mat_adap))
idx <- 1
for(i in mat_adap$Ind){
obj <- unique(afis$season[afis$sampleId == i])
mat_adap[idx, 5] <- obj
idx = idx + 1
}
mat_adap$season[mat_adap$season == "frost"] <- "fall"
head(mat_adap)
## country long lat Ind season
## 1 Austria 15.965 48.2875 AT_gr_12_fall fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring
## 3 Austria 15.965 48.2875 AT_Mau_14_01 spring
## 4 Austria 15.965 48.2875 AT_Mau_14_02 fall
## 5 Austria 15.965 48.2875 AT_Mau_15_50 spring
## 6 Austria 15.965 48.2875 AT_Mau_15_51 fall
## extracting NPP
NPP_file <- list.files("/Users/dpadil10/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file
## [1] "/Users/dpadil10/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"
NPP.data <- stack(NPP_file)
prod <- mat_adap[ , c(2, 3)] ## loading the dataframe with the coordinates
head(prod)
## long lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
NPP.data.ext <- extract(NPP.data, prod)
head(NPP.data.ext)
## npp_geotiff
## [1,] 264522317824
## [2,] 264522317824
## [3,] 264522317824
## [4,] 264522317824
## [5,] 264522317824
## [6,] 264522317824
NPP.data.ext <- as.data.frame(NPP.data.ext)
colnames(NPP.data.ext) <- "NPP"
head(NPP.data.ext)
## NPP
## 1 264522317824
## 2 264522317824
## 3 264522317824
## 4 264522317824
## 5 264522317824
## 6 264522317824
##is.na(NPP.data.ext)
## extracting BIO15 precipitation seasonality (coefficient of variation), BIO4 temperature seasonality (standard deviation * 100)
files <- list.files("/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio", full.names = TRUE)
files
## [1] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_10.tif"
## [2] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_11.tif"
## [3] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_14.tif"
## [4] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_15.tif"
## [5] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_16.tif"
## [6] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_18.tif"
## [7] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_19.tif"
## [8] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_2.tif"
## [9] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_3.tif"
## [10] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_4.tif"
## [11] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_5.tif"
## [12] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_6.tif"
## [13] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_8.tif"
## [14] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_9.tif"
pre_sea <- stack(files[4])
tem_sea <- stack(files[10])
coord <- mat_adap[ , c(2, 3)]
head(coord)
## long lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875
prec.season <- extract(pre_sea, coord)
temp.season <- extract(tem_sea, coord)
variables <- cbind(prec.season, temp.season, log(NPP.data.ext))
head(variables)
## wc2.1_30s_bio_15 wc2.1_30s_bio_4 NPP
## 1 37.24262 744.4373 26.30119
## 2 37.24262 744.4373 26.30119
## 3 37.24262 744.4373 26.30119
## 4 37.24262 744.4373 26.30119
## 5 37.24262 744.4373 26.30119
## 6 37.24262 744.4373 26.30119
colnames(variables) <- c("precseason", "tempseason", "logNPP")
head(variables)
## precseason tempseason logNPP
## 1 37.24262 744.4373 26.30119
## 2 37.24262 744.4373 26.30119
## 3 37.24262 744.4373 26.30119
## 4 37.24262 744.4373 26.30119
## 5 37.24262 744.4373 26.30119
## 6 37.24262 744.4373 26.30119
env <- cbind(mat_adap, variables)
head(env)
## country long lat Ind season precseason tempseason logNPP
## 1 Austria 15.965 48.2875 AT_gr_12_fall fall 37.24262 744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring 37.24262 744.4373 26.30119
## 3 Austria 15.965 48.2875 AT_Mau_14_01 spring 37.24262 744.4373 26.30119
## 4 Austria 15.965 48.2875 AT_Mau_14_02 fall 37.24262 744.4373 26.30119
## 5 Austria 15.965 48.2875 AT_Mau_15_50 spring 37.24262 744.4373 26.30119
## 6 Austria 15.965 48.2875 AT_Mau_15_51 fall 37.24262 744.4373 26.30119
## PCA of environmental variables
x <- tab(fly_gen3, NA.method = "mean")
snppca <- as.data.frame(x)
snppca$Ind <- rownames(x)
snppca$country <- fly_gen3$pop
dim(snppca)
## [1] 151 3102
gen <- snppca[order(snppca$country), ]
merging <- merge(gen, env, by = "Ind")
merging[1:5, 1:5]
## Ind 162333.A 162333.C 162335.C 162335.T
## 1 AT_gr_12_fall 1 1 1 1
## 2 AT_gr_12_spring 1 1 1 1
## 3 AT_Mau_14_01 0 2 1 1
## 4 AT_Mau_14_02 1 1 1 1
## 5 AT_Mau_15_50 1 1 1 1
rownames(merging) <- merging[ , 1]
dim(merging)
## [1] 151 3109
merging[1:5, 3101:3109]
## 163997.T country.x country.y long lat season precseason
## AT_gr_12_fall 0 Austria Austria 15.965 48.2875 fall 37.24262
## AT_gr_12_spring 0 Austria Austria 15.965 48.2875 spring 37.24262
## AT_Mau_14_01 0 Austria Austria 15.965 48.2875 spring 37.24262
## AT_Mau_14_02 1 Austria Austria 15.965 48.2875 fall 37.24262
## AT_Mau_15_50 0 Austria Austria 15.965 48.2875 spring 37.24262
## tempseason logNPP
## AT_gr_12_fall 744.4373 26.30119
## AT_gr_12_spring 744.4373 26.30119
## AT_Mau_14_01 744.4373 26.30119
## AT_Mau_14_02 744.4373 26.30119
## AT_Mau_15_50 744.4373 26.30119
merging <- merging[ , -c(1, 3102:3109)]
dim(merging)
## [1] 151 3100
merging[1:5, 3090:3100]
## 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall 0 1 1 1 1 2 0
## AT_gr_12_spring 0 1 1 1 1 2 0
## AT_Mau_14_01 1 1 1 1 1 2 0
## AT_Mau_14_02 0 1 1 1 1 2 0
## AT_Mau_15_50 1 1 1 1 1 2 0
## 163996.G 163996.C 163997.A 163997.T
## AT_gr_12_fall 1 1 2 0
## AT_gr_12_spring 1 1 2 0
## AT_Mau_14_01 1 1 2 0
## AT_Mau_14_02 1 1 1 1
## AT_Mau_15_50 1 1 2 0
genomic <- merging
#env$Ind <- as.factor(env$Ind)
#env$Ind <- as.character(levels(env$Ind))
dim(snppca)
## [1] 151 3102
snppca[1:5, 1:5]
## 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall 1 1 1 1 2
## AT_gr_12_spring 1 1 1 1 2
## AT_Mau_14_01 0 2 1 1 1
## AT_Mau_14_02 1 1 1 1 2
## AT_Mau_15_50 1 1 1 1 2
snppca[1:5, 3090:3102]
## 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall 0 1 1 1 1 2 0
## AT_gr_12_spring 0 1 1 1 1 2 0
## AT_Mau_14_01 1 1 1 1 1 2 0
## AT_Mau_14_02 0 1 1 1 1 2 0
## AT_Mau_15_50 1 1 1 1 1 2 0
## 163996.G 163996.C 163997.A 163997.T Ind country
## AT_gr_12_fall 1 1 2 0 AT_gr_12_fall Austria
## AT_gr_12_spring 1 1 2 0 AT_gr_12_spring Austria
## AT_Mau_14_01 1 1 2 0 AT_Mau_14_01 Austria
## AT_Mau_14_02 1 1 1 1 AT_Mau_14_02 Austria
## AT_Mau_15_50 1 1 2 0 AT_Mau_15_50 Austria
snppca <- snppca[ , -c(3101, 3102)]
dim(snppca)
## [1] 151 3100
snppca[1:5, 3090:3100]
## 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall 0 1 1 1 1 2 0
## AT_gr_12_spring 0 1 1 1 1 2 0
## AT_Mau_14_01 1 1 1 1 1 2 0
## AT_Mau_14_02 0 1 1 1 1 2 0
## AT_Mau_15_50 1 1 1 1 1 2 0
## 163996.G 163996.C 163997.A 163997.T
## AT_gr_12_fall 1 1 2 0
## AT_gr_12_spring 1 1 2 0
## AT_Mau_14_01 1 1 2 0
## AT_Mau_14_02 1 1 1 1
## AT_Mau_15_50 1 1 2 0
genomic <- genomic[env[ , 4], ]
identical(rownames(genomic), env[ , 4]) ## genotypes and environmental data are in the same order
## [1] TRUE
## How many seasons per country are there?
table(data.frame(season = env$season, country = env$country))
## country
## season Austria Charlottesville Denmark Esparto Finland France Germany MI-WI
## fall 3 3 4 3 2 4 6 5
## spring 3 3 1 2 4 6 6 4
## country
## season NY-MA PA Russia Spain Tuolumne Turkey Ukraine
## fall 3 9 3 4 4 4 14
## spring 3 8 3 3 1 9 24
xtable(table(data.frame(season = env$season, country = env$country)), caption = "caption here")
## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Wed Nov 15 23:10:37 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrrrrrrrrrrr}
## \hline
## & Austria & Charlottesville & Denmark & Esparto & Finland & France & Germany & MI-WI & NY-MA & PA & Russia & Spain & Tuolumne & Turkey & Ukraine \\
## \hline
## fall & 3 & 3 & 4 & 3 & 2 & 4 & 6 & 5 & 3 & 9 & 3 & 4 & 4 & 4 & 14 \\
## spring & 3 & 3 & 1 & 2 & 4 & 6 & 6 & 4 & 3 & 8 & 3 & 3 & 1 & 9 & 24 \\
## \hline
## \end{tabular}
## \caption{caption here}
## \end{table}
names(env)
## [1] "country" "long" "lat" "Ind" "season"
## [6] "precseason" "tempseason" "logNPP"
## CCA
gen.cca <- cca(genomic ~ season*country, env)
gen.cca
## Call: cca(formula = genomic ~ season * country, data = env)
##
## Inertia Proportion Rank
## Total 0.21786 1.00000
## Constrained 0.07607 0.34917 29
## Unconstrained 0.14179 0.65083 121
## Inertia is scaled Chi-square
##
## Eigenvalues for constrained axes:
## CCA1 CCA2 CCA3 CCA4 CCA5 CCA6 CCA7 CCA8
## 0.027197 0.006577 0.003781 0.003062 0.002942 0.002595 0.002390 0.002168
## CCA9 CCA10 CCA11 CCA12 CCA13 CCA14 CCA15 CCA16
## 0.002039 0.001935 0.001810 0.001709 0.001580 0.001386 0.001334 0.001266
## CCA17 CCA18 CCA19 CCA20 CCA21 CCA22 CCA23 CCA24
## 0.001217 0.001095 0.001071 0.001057 0.001016 0.000947 0.000919 0.000913
## CCA25 CCA26 CCA27 CCA28 CCA29
## 0.000874 0.000861 0.000821 0.000788 0.000720
##
## Eigenvalues for unconstrained axes:
## CA1 CA2 CA3 CA4 CA5 CA6 CA7 CA8
## 0.006362 0.003602 0.003278 0.003023 0.002933 0.002655 0.002486 0.002442
## (Showing 8 of 121 unconstrained eigenvalues)
anova(gen.cca)
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
##
## Model: cca(formula = genomic ~ season * country, data = env)
## Df ChiSquare F Pr(>F)
## Model 29 0.076071 2.2385 0.001 ***
## Residual 121 0.141792
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
env$country <- as.factor(env$country)
levels(env$country)
## [1] "Austria" "Charlottesville" "Denmark" "Esparto"
## [5] "Finland" "France" "Germany" "MI-WI"
## [9] "NY-MA" "PA" "Russia" "Spain"
## [13] "Tuolumne" "Turkey" "Ukraine"
cols <- env$country
cols <- as.numeric(cols)
ncol <- nPop(fly_gen3)
palette <- distinctColorPalette(ncol)
colors <- rep()
for(i in cols){
col <- palette[i]
colors <- c(colors, col)
}
colors
## [1] "#BD4BDB" "#BD4BDB" "#BD4BDB" "#BD4BDB" "#BD4BDB" "#BD4BDB" "#E08851"
## [8] "#E08851" "#E08851" "#E08851" "#E08851" "#E08851" "#E2AAA2" "#E2AAA2"
## [15] "#E2AAA2" "#E2AAA2" "#E2AAA2" "#D49EDA" "#D49EDA" "#D49EDA" "#D49EDA"
## [22] "#D49EDA" "#DE609B" "#DE609B" "#DE609B" "#DE609B" "#DE609B" "#DE609B"
## [29] "#7776D8" "#7776D8" "#7776D8" "#7776D8" "#7776D8" "#7776D8" "#7776D8"
## [36] "#7776D8" "#7776D8" "#7776D8" "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0"
## [43] "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0"
## [50] "#85E2A0" "#D8CC6A" "#D8CC6A" "#D8CC6A" "#D8CC6A" "#D8CC6A" "#D8CC6A"
## [57] "#D8CC6A" "#D8CC6A" "#D8CC6A" "#757C7A" "#757C7A" "#757C7A" "#757C7A"
## [64] "#757C7A" "#757C7A" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5"
## [71] "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5"
## [78] "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#7DB0DE" "#7DB0DE"
## [85] "#7DB0DE" "#7DB0DE" "#7DB0DE" "#7DB0DE" "#72E95E" "#72E95E" "#72E95E"
## [92] "#72E95E" "#72E95E" "#72E95E" "#72E95E" "#75DED6" "#75DED6" "#75DED6"
## [99] "#75DED6" "#75DED6" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF"
## [106] "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF"
## [113] "#CCD6DF" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [120] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [127] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [134] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [141] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [148] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## png("cca_revision3.png", width = 7, height = 7, units = "in", res = 300)
## pdf("cca_revision3.pdf")
plot(gen.cca, display = "sites", type = "n", las = 1)
with(gen.cca, points(gen.cca, display = "sites", pch = 21, bg = colors, col = colors))
##with(env[3], ordiellipse(gen.cca, env$country, kind = "se", conf = 0.95, col = "gray"))
##with(env[3], ordihull(gen.cca, env$country, col = colors, lty = 2))
with(env[3], ordispider(gen.cca, env$country, col = unique(colors), label = TRUE))
##legend("topright", legend = levels(env$country), pch = 19, col = unique(cols), bty = "n", cex = 0.8)
## dev.off()
Figure 4. Constrained correspondence analysis depicting the difference in allele frequency across populations mediated by the interaction between season and locality. Filled dots represent pooled samples and “spider” diagrams connect the pools to the population centroid.
## Detecting selection
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.pv1.prec.chr2.RData")
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.pv1.temp.chr2.RData")
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.pv1.npp.chr2.RData")
fly.pv1.prec.chr2 <- fly.pv1.prec
fly.pv1.temp.chr2 <- fly.pv1.temp
fly.pv1.npp.chr2 <- fly.pv1.npp
rm(fly.pv1.prec)
rm(fly.pv1.temp)
rm(fly.pv1.npp)
head(env)
## country long lat Ind season precseason tempseason logNPP
## 1 Austria 15.965 48.2875 AT_gr_12_fall fall 37.24262 744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring 37.24262 744.4373 26.30119
## 3 Austria 15.965 48.2875 AT_Mau_14_01 spring 37.24262 744.4373 26.30119
## 4 Austria 15.965 48.2875 AT_Mau_14_02 fall 37.24262 744.4373 26.30119
## 5 Austria 15.965 48.2875 AT_Mau_15_50 spring 37.24262 744.4373 26.30119
## 6 Austria 15.965 48.2875 AT_Mau_15_51 fall 37.24262 744.4373 26.30119
##env.pca <- rda(env[ , c(2, 3, 6, 7, 8)], scale = T)
env.pca <- rda(env[ , c(6, 7, 8)], scale = TRUE)
summary(env.pca)$cont
## $importance
## Importance of components:
## PC1 PC2 PC3
## Eigenvalue 1.2227 0.9646 0.8127
## Proportion Explained 0.4076 0.3215 0.2709
## Cumulative Proportion 0.4076 0.7291 1.0000
par(las = 1)
screeplot(env.pca, bstick = TRUE, type = "lines", las = 1, main = "")
round(scores(env.pca, choices = 1:3, display = "species", scaling = 0), digits = 3)
## PC1 PC2 PC3
## precseason -0.655 0.273 -0.704
## tempseason 0.658 -0.251 -0.710
## logNPP 0.371 0.929 0.016
## attr(,"const")
## [1] 4.605779
#pred.PC2 <- scores(env.pca, choices = 2, display = "sites", scaling = 0)
##pred.PC1 <- scores(env.pca, choices = 1:2, display = "sites", scaling = 0) ## reviewer 3 suggested to use PC1 instead
K <- 9
fly.lfmm1.prec <- lfmm_ridge(Y = snppca, X = env$precseason, K = K) ## change K as you see fit
fly.lfmm1.temp <- lfmm_ridge(Y = snppca, X = env$tempseason, K = K) ## change K as you see fit
fly.lfmm1.npp <- lfmm_ridge(Y = snppca, X = env$logNPP, K = K) ## change K as you see fit
##fly.lfmm1 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K) ## using PC1 as suggested by reviewer 3
fly.pv1.prec <- lfmm_test(Y = snppca, X = env$precseason, lfmm = fly.lfmm1.prec, calibrate = "gif")
fly.pv1.temp <- lfmm_test(Y = snppca, X = env$tempseason, lfmm = fly.lfmm1.temp, calibrate = "gif")
fly.pv1.npp <- lfmm_test(Y = snppca, X = env$logNPP, lfmm = fly.lfmm1.npp, calibrate = "gif")
##fly.pv1 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm1, calibrate = "gif") ## using PC1 as suggested by reviewer 3
names(fly.pv1.prec) # this object includes raw z-scores and p-values, as well as GIF-calibrated scores and p-values
## [1] "B" "epsilon.sigma2" "B.sigma2"
## [4] "score" "pvalue" "gif"
## [7] "calibrated.score2" "calibrated.pvalue"
names(fly.pv1.prec.chr2)
## [1] "B" "epsilon.sigma2" "B.sigma2"
## [4] "score" "pvalue" "gif"
## [7] "calibrated.score2" "calibrated.pvalue"
## let’s look at the genomic inflation factor (GIF)
fly.pv1.prec$gif
## [1] 1.019627
fly.pv1.temp$gif
## [1] 2.071159
fly.pv1.npp$gif
## [1] 1.066583
fly.pv1.prec.chr2$gif
## [1] 1.294608
fly.pv1.temp.chr2$gif
## [1] 1.79341
fly.pv1.npp.chr2$gif
## [1] 1.191985
## an appropriately calibrated set of tests will have a GIF of around 1
## let’s look at how application of the GIF to the p-values impacts the p-value distribution
quartz()
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4), nrow = 4, ncol = 4, byrow = FALSE))
par(mar = c(5, 5, 1, 1), las = 1)
hist(fly.pv1.prec$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "for", ylim = c(0, 350))
legend("top", "K = 9", bty = "n")
hist(fly.pv1.prec$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 400))
legend("top", "K = 9", bty = "n")
hist(fly.pv1.prec.chr2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "Chr 2L", ylim = c(0, 120000))
legend("top", "K = 8", bty = "n")
hist(fly.pv1.prec.chr2$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 140000))
legend("top", "K = 8", bty = "n")
## convert the adjusted p-values to q-values. q-values provide a measure of each SNP’s significance, automatically taking into account the fact that thousands are simultaneously being tested. I can then use an FDR threshold to control the number of false positive detections (given that the p-value distribution is “well-behaved”)
fly.qv1.prec <- qvalue(fly.pv1.prec$calibrated.pvalue)$qvalues
fly.qv1.temp <- qvalue(fly.pv1.temp$calibrated.pvalue)$qvalues
fly.qv1.npp <- qvalue(fly.pv1.npp$calibrated.pvalue)$qvalues
length(which(fly.qv1.prec < 0.001)) ## how many SNPs have an FDR < 0.1%?
## [1] 6
length(which(fly.qv1.temp < 0.001)) ## how many SNPs have an FDR < 0.1%?
## [1] 0
length(which(fly.qv1.npp < 0.001)) ## how many SNPs have an FDR < 0.1%?
## [1] 0
fly.qv1.prec.chr2 <- qvalue(fly.pv1.prec.chr2$calibrated.pvalue)$qvalues
fly.qv1.temp.chr2 <- qvalue(fly.pv1.temp.chr2$calibrated.pvalue)$qvalues
fly.qv1.npp.chr2 <- qvalue(fly.pv1.npp.chr2$calibrated.pvalue)$qvalues
length(which(fly.qv1.prec.chr2 < 0.001)) ## how many SNPs have an FDR < 0.1%?
## [1] 104
length(which(fly.qv1.temp.chr2 < 0.001)) ## how many SNPs have an FDR < 0.1%?
## [1] 4
length(which(fly.qv1.npp.chr2 < 0.001)) ## how many SNPs have an FDR < 0.1%?
## [1] 32
## using K = 9, the default GIF correction, and an FDR threshold of 0.001, I detected 2 candidate SNPs under selection in response to the PC2 environmental predictor
(fly.FDR.prec <- colnames(snppca)[which(fly.qv1.prec < 0.001)]) ## identify which SNPs these are
## [1] "162369.T" "162369.C" "163496.G" "163496.A" "163504.G" "163504.A"
(fly.FDR.temp <- colnames(snppca)[which(fly.qv1.temp < 0.001)]) ## identify which SNPs these are
## character(0)
(fly.FDR.npp <- colnames(snppca)[which(fly.qv1.npp < 0.001)]) ## identify which SNPs these are
## character(0)
dim(snppca)
## [1] 151 3100
rm(snppca)
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/snppca_chr2.RData")
dim(snppca)
## [1] 151 1862322
(fly.FDR.prec.chr2 <- colnames(snppca)[which(fly.qv1.prec.chr2 < 0.001)]) ## identify which SNPs these are
## [1] "4341.A" "4341.T" "20722.C" "20722.T" "24540.T" "24540.A"
## [7] "42028.C" "42028.T" "49396.C" "49396.G" "71603.T" "71603.C"
## [13] "79614.A" "79614.G" "143754.T" "143754.C" "144065.T" "144065.C"
## [19] "163496.G" "163496.A" "281854.G" "281854.C" "339746.G" "339746.A"
## [25] "358648.C" "358648.T" "359700.C" "359700.G" "359838.G" "359838.C"
## [31] "359851.G" "359851.A" "365020.C" "365020.T" "428037.C" "428037.T"
## [37] "435493.C" "435493.A" "436264.C" "436264.T" "460821.T" "460821.A"
## [43] "461800.G" "461800.A" "479192.C" "479192.G" "542516.T" "542516.C"
## [49] "549898.C" "549898.G" "576471.T" "576471.C" "592567.A" "592567.G"
## [55] "638134.C" "638134.A" "658660.G" "658660.A" "659375.A" "659375.G"
## [61] "693165.G" "693165.A" "693375.T" "693375.G" "707347.A" "707347.G"
## [67] "713129.A" "713129.G" "716215.G" "716215.A" "721348.A" "721348.T"
## [73] "726550.A" "726550.G" "737646.G" "737646.T" "766843.G" "766843.C"
## [79] "776548.G" "776548.A" "817451.T" "817451.A" "817454.A" "817454.G"
## [85] "835825.C" "835825.T" "844869.A" "844869.C" "845091.C" "845091.G"
## [91] "918074.G" "918074.A" "927644.T" "927644.C" "943283.G" "943283.T"
## [97] "952506.C" "952506.T" "967638.G" "967638.A" "973978.G" "973978.A"
## [103] "975445.G" "975445.A"
(fly.FDR.temp.chr2 <- colnames(snppca)[which(fly.qv1.temp.chr2 < 0.001)]) ## identify which SNPs these are
## [1] "555268.G" "555268.T" "571263.A" "571263.G"
(fly.FDR.temp.chr2 <- colnames(snppca)[which(fly.qv1.npp.chr2 < 0.001)]) ## identify which SNPs these are
## [1] "3205.G" "3205.A" "95522.T" "95522.C" "110380.A" "110380.T"
## [7] "178736.T" "178736.C" "224757.T" "224757.G" "249489.G" "249489.A"
## [13] "295197.T" "295197.C" "337949.C" "337949.T" "347923.A" "347923.G"
## [19] "425629.G" "425629.A" "568526.C" "568526.G" "590620.G" "590620.A"
## [25] "744599.C" "744599.T" "745747.C" "745747.T" "747288.T" "747288.C"
## [31] "858669.T" "858669.C"
## visualizing results
## Manhattan plot with causal loci
qvalues <- fly.qv1.prec
qvalues.chr2 <- fly.qv1.prec.chr2
row <- as.data.frame(qvalues)
str(row)
## 'data.frame': 3100 obs. of 1 variable:
## $ V1: num 0.604 0.604 0.999 0.999 0.999 ...
rownames(row) <- 1:nrow(row)
row.chr2 <- as.data.frame(qvalues.chr2)
str(row.chr2)
## 'data.frame': 1862322 obs. of 1 variable:
## $ V1: num 1 1 1 1 1 ...
rownames(row.chr2) <- 1:nrow(row.chr2)
(causal.set <- as.numeric(rownames(row)[which(row < 0.001)]))
## [1] 67 68 2161 2162 2175 2176
(fly.FDR.prec <- gsub("[^0-9]", "", fly.FDR.prec))
## [1] "162369" "162369" "163496" "163496" "163504" "163504"
head(afis[afis$variant.id == 162369, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2)
## variant.id col gene chr pos genotype type
## 1: 162369 upstream_gene_variant CG15418 2L 3623241 T,C pooled
## 2: 162369 upstream_gene_variant CG15418 2L 3623241 T,C pooled
## stationId
## 1: AUM00011389
## 2: AUM00011389
head(afis[afis$variant.id == 163496, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2)
## variant.id col gene chr pos genotype type stationId
## 1: 163496 intron_variant for 2L 3642321 G,A pooled AUM00011389
## 2: 163496 intron_variant for 2L 3642321 G,A pooled AUM00011389
head(afis[afis$variant.id == 163504, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2)
## variant.id col gene chr pos genotype type stationId
## 1: 163504 intron_variant for 2L 3642581 G,A pooled AUM00011389
## 2: 163504 intron_variant for 2L 3642581 G,A pooled AUM00011389
(causal.set.chr2 <- as.numeric(rownames(row.chr2)[which(row.chr2 < 0.001)]))
## [1] 8335 8336 39089 39090 45925 45926 77569 77570 91269
## [10] 91270 132685 132686 147341 147342 266101 266102 266683 266684
## [19] 302205 302206 519505 519506 625735 625736 660431 660432 662379
## [28] 662380 662635 662636 662661 662662 672287 672288 789701 789702
## [37] 803555 803556 804963 804964 850903 850904 852743 852744 885291
## [46] 885292 1003143 1003144 1016873 1016874 1066663 1066664 1097045 1097046
## [55] 1181605 1181606 1219657 1219658 1221031 1221032 1284071 1284072 1284465
## [64] 1284466 1310821 1310822 1321549 1321550 1327407 1327408 1337029 1337030
## [73] 1346873 1346874 1367921 1367922 1422971 1422972 1441219 1441220 1517919
## [82] 1517920 1517925 1517926 1552999 1553000 1570079 1570080 1570499 1570500
## [91] 1709595 1709596 1727421 1727422 1756843 1756844 1774303 1774304 1803125
## [100] 1803126 1815307 1815308 1818119 1818120
(fly.FDR.prec.chr2 <- gsub("[^0-9]", "", fly.FDR.prec.chr2))
## [1] "4341" "4341" "20722" "20722" "24540" "24540" "42028" "42028"
## [9] "49396" "49396" "71603" "71603" "79614" "79614" "143754" "143754"
## [17] "144065" "144065" "163496" "163496" "281854" "281854" "339746" "339746"
## [25] "358648" "358648" "359700" "359700" "359838" "359838" "359851" "359851"
## [33] "365020" "365020" "428037" "428037" "435493" "435493" "436264" "436264"
## [41] "460821" "460821" "461800" "461800" "479192" "479192" "542516" "542516"
## [49] "549898" "549898" "576471" "576471" "592567" "592567" "638134" "638134"
## [57] "658660" "658660" "659375" "659375" "693165" "693165" "693375" "693375"
## [65] "707347" "707347" "713129" "713129" "716215" "716215" "721348" "721348"
## [73] "726550" "726550" "737646" "737646" "766843" "766843" "776548" "776548"
## [81] "817451" "817451" "817454" "817454" "835825" "835825" "844869" "844869"
## [89] "845091" "845091" "918074" "918074" "927644" "927644" "943283" "943283"
## [97] "952506" "952506" "967638" "967638" "973978" "973978" "975445" "975445"
data.frame(causal.set.chr2, fly.FDR.prec.chr2)
## causal.set.chr2 fly.FDR.prec.chr2
## 1 8335 4341
## 2 8336 4341
## 3 39089 20722
## 4 39090 20722
## 5 45925 24540
## 6 45926 24540
## 7 77569 42028
## 8 77570 42028
## 9 91269 49396
## 10 91270 49396
## 11 132685 71603
## 12 132686 71603
## 13 147341 79614
## 14 147342 79614
## 15 266101 143754
## 16 266102 143754
## 17 266683 144065
## 18 266684 144065
## 19 302205 163496
## 20 302206 163496
## 21 519505 281854
## 22 519506 281854
## 23 625735 339746
## 24 625736 339746
## 25 660431 358648
## 26 660432 358648
## 27 662379 359700
## 28 662380 359700
## 29 662635 359838
## 30 662636 359838
## 31 662661 359851
## 32 662662 359851
## 33 672287 365020
## 34 672288 365020
## 35 789701 428037
## 36 789702 428037
## 37 803555 435493
## 38 803556 435493
## 39 804963 436264
## 40 804964 436264
## 41 850903 460821
## 42 850904 460821
## 43 852743 461800
## 44 852744 461800
## 45 885291 479192
## 46 885292 479192
## 47 1003143 542516
## 48 1003144 542516
## 49 1016873 549898
## 50 1016874 549898
## 51 1066663 576471
## 52 1066664 576471
## 53 1097045 592567
## 54 1097046 592567
## 55 1181605 638134
## 56 1181606 638134
## 57 1219657 658660
## 58 1219658 658660
## 59 1221031 659375
## 60 1221032 659375
## 61 1284071 693165
## 62 1284072 693165
## 63 1284465 693375
## 64 1284466 693375
## 65 1310821 707347
## 66 1310822 707347
## 67 1321549 713129
## 68 1321550 713129
## 69 1327407 716215
## 70 1327408 716215
## 71 1337029 721348
## 72 1337030 721348
## 73 1346873 726550
## 74 1346874 726550
## 75 1367921 737646
## 76 1367922 737646
## 77 1422971 766843
## 78 1422972 766843
## 79 1441219 776548
## 80 1441220 776548
## 81 1517919 817451
## 82 1517920 817451
## 83 1517925 817454
## 84 1517926 817454
## 85 1552999 835825
## 86 1553000 835825
## 87 1570079 844869
## 88 1570080 844869
## 89 1570499 845091
## 90 1570500 845091
## 91 1709595 918074
## 92 1709596 918074
## 93 1727421 927644
## 94 1727422 927644
## 95 1756843 943283
## 96 1756844 943283
## 97 1774303 952506
## 98 1774304 952506
## 99 1803125 967638
## 100 1803126 967638
## 101 1815307 973978
## 102 1815308 973978
## 103 1818119 975445
## 104 1818120 975445
quartz()
layout(matrix(c(0, 1, 1, 0,
0, 1, 1, 0,
0, 2, 2, 0,
0, 2, 2, 0), nrow = 4, ncol = 4, byrow = TRUE))
plot(-log10(qvalues), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, main = "for")
points(causal.set[c(3, 5)], -log10(qvalues)[causal.set[c(3, 5)]], pch = 19, col = "red")
text(causal.set[c(3, 5)], (-log10(qvalues)[causal.set[c(3, 5)]] - 0.4), fly.FDR.prec[c(3, 5)], col = "red")
plot(-log10(qvalues.chr2), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, main = "Chr 2L")
points(causal.set.chr2[c(19, 20)], -log10(qvalues.chr2)[causal.set.chr2[c(19, 20)]], pch = 19, col = "red")
text(causal.set.chr2[c(19, 20)], (-log10(qvalues.chr2)[causal.set.chr2[c(19, 20)]] - 0.5), fly.FDR.prec.chr2[c(19, 20)], col = "red", cex = 0.8)
quartz()
## png("adaptation_revision2.png", width = 7, height = 7, units = "in", res = 300)
## pdf("adaptation_revision2.pdf")
layout(matrix(c(1, 1, 2, 2,
1, 1, 2, 2,
3, 3, 4, 4,
3, 3, 4, 4), nrow = 4, ncol = 4, byrow = FALSE))
hist(fly.pv1.prec$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "for", ylim = c(0, 350))
legend("top", paste("K = 9\nGIF = ", round(fly.pv1.prec$gif, 3)), bty = "n")
mtext("(a)", side = 2, at = 365, line = 2.5, las = 1, font = 2, family = "serif")
plot(-log10(qvalues), pch = 19, cex = 1, bg = "gray", col = alpha("gray", 0.6), xlab = "SNP", las = 1)
points(causal.set[c(3, 5)], -log10(qvalues)[causal.set[c(3, 5)]], pch = 19, col = "red")
text(causal.set[c(3, 5)], (-log10(qvalues)[causal.set[c(3, 5)]] - 0.4), fly.FDR.prec[c(3, 5)], col = "red")
par(mar = c(6, 7, 4.1, 1.5), mgp = c(4, 1.5, 0))
hist(fly.pv1.prec.chr2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "Chr 2L", ylim = c(0, 120000))
legend("top", paste("K = 8\nGIF = ", round(fly.pv1.prec.chr2$gif, 3)), bty = "n")
mtext("(b)", side = 2, at = 122000, line = 4.5, las = 1, font = 2, family = "serif")
par(mar = c(5.1, 4.1, 4.1, 1.5), mgp = c(3, 1, 0))
plot(-log10(qvalues.chr2), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, main = "")
points(causal.set.chr2[-c(19, 20)], -log10(qvalues.chr2)[causal.set.chr2[-c(19, 20)]], pch = 19, col = "black")
points(causal.set.chr2[c(19, 20)], -log10(qvalues.chr2)[causal.set.chr2[c(19, 20)]], pch = 19, col = "red")
text(causal.set.chr2[c(19, 20)], (-log10(qvalues.chr2)[causal.set.chr2[c(19, 20)]] - 0.5), fly.FDR.prec.chr2[c(19, 20)], col = "red", cex = 0.8)
## dev.off()
Figure 5. Genotype-environment association test based on a latent factor mixed model with ridge penalty. (a) Distribution of adjusted p-values using the default genomic inflation factor, and the for loci (SNPs) potentially affected by precipitation. Likewise, (b) displays the same analysis for the entire chromosome 2L. Bold dots represent loci considered to be under selection according to a False Discovery Rate of 0.001.