index.knit

Data analysis for:

Dylan J. Padilla Perez, 2023. Geographic and seasonal variation of the for gene reveal signatures of local adaptation in Drosophila melanogaster.

Dylan J. Padilla Perez, School of Life Sciences, Arizona State University, Tempe, AZ 85287, USA.

Library

library(adegenet)
library(ade4)
library(car)
library(ComplexHeatmap)
library(data.table)
library(dartR)
library(ecodist)
library(hierfstat)
library(gplots)
library(LEA)
library(lfmm)
library(maps)
library(mapplots)
library(MASS)
library(PBSmapping)
library(pegas)
library(poppr)
library(qvalue)
library(randomcoloR)
library(raster)
library(reshape)
library(rworldmap)
library(scales)
library(SeqArray)
library(SeqVarTools)
library(shape)
library(SNPRelate)
library(stringr)
library(vcfR)
library(vegan)
library(xtable)

Session information

R.version

                 _                           
  platform       x86_64-apple-darwin17.0     
  arch           x86_64                      
  os             darwin17.0                  
  system         x86_64, darwin17.0          
  status                                     
  major          4                           
  minor          2.2                         
  year           2022                        
  month          10                          
  day            31                          
  svn rev        83211                       
  language       R                           
  version.string R version 4.2.2 (2022-10-31)
  nickname       Innocent and Trusting

sessionInfo()

  R version 4.2.2 (2022-10-31)
  Platform: x86_64-apple-darwin17.0 (64-bit)
  Running under: macOS Catalina 10.15.7
  
  Matrix products: default
  BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
  LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
  
  locale:
  [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
  
  attached base packages:
  [1] grid      stats     graphics  grDevices utils     datasets  methods  
  [8] base     
  
  other attached packages:
   [1] xtable_1.8-4          vegan_2.6-4           lattice_0.21-8       
   [4] permute_0.9-7         vcfR_1.14.0           stringr_1.5.0        
   [7] SNPRelate_1.32.2      shape_1.4.6           SeqVarTools_1.36.0   
  [10] SeqArray_1.38.0       gdsfmt_1.34.1         scales_1.2.1         
  [13] rworldmap_1.3-6       reshape_0.8.9         raster_3.6-20        
  [16] sp_1.6-0              randomcoloR_1.1.0.1   qvalue_2.30.0        
  [19] poppr_2.9.4           pegas_1.2             ape_5.7-1            
  [22] PBSmapping_2.73.2     MASS_7.3-59           mapplots_1.5.1       
  [25] maps_3.4.1            lfmm_1.1              LEA_3.10.2           
  [28] gplots_3.1.3          hierfstat_0.5-11      ecodist_2.0.9        
  [31] dartR_2.7.2           dartR.data_1.0.2      dplyr_1.1.2          
  [34] ggplot2_3.4.2         data.table_1.14.8     ComplexHeatmap_2.14.0
  [37] car_3.1-2             carData_3.0-5         adegenet_2.1.10      
  [40] ade4_1.7-22           rmarkdown_2.21       
  
  loaded via a namespace (and not attached):
    [1] utf8_1.2.3             R.utils_2.12.2         tidyselect_1.2.0      
    [4] combinat_0.0-8         Rtsne_0.16             maptools_1.1-6        
    [7] StAMPP_1.6.3           GWASExactHW_1.01       munsell_0.5.0         
   [10] codetools_0.2-19       withr_2.5.0            colorspace_2.1-0      
   [13] Biobase_2.58.0         knitr_1.42             stats4_4.2.2          
   [16] RgoogleMaps_1.4.5.3    logistf_1.24.1         GenomeInfoDbData_1.2.9
   [19] gap.datasets_0.0.5     vctrs_0.6.2            generics_0.1.3        
   [22] xfun_0.39              R6_2.5.1               doParallel_1.0.17     
   [25] GenomeInfoDb_1.34.9    clue_0.3-64            fields_14.1           
   [28] bitops_1.0-7           cachem_1.0.8           promises_1.2.0.1      
   [31] pinfsc50_1.2.0         gtable_0.3.3           formula.tools_1.7.1   
   [34] spam_2.9-1             rlang_1.1.1            calibrate_1.7.7       
   [37] GlobalOptions_0.1.2    splines_4.2.2          rgdal_1.6-6           
   [40] broom_1.0.4            yaml_2.3.7             reshape2_1.4.4        
   [43] abind_1.4-5            backports_1.4.1        httpuv_1.6.9          
   [46] tools_4.2.2            ellipsis_0.3.2         jquerylib_0.1.4       
   [49] RColorBrewer_1.1-3     BiocGenerics_0.44.0    Rcpp_1.0.10           
   [52] plyr_1.8.8             zlibbioc_1.44.0        purrr_1.0.1           
   [55] RCurl_1.98-1.12        GetoptLong_1.0.5       viridis_0.6.3         
   [58] S4Vectors_0.36.2       cluster_2.1.4          magrittr_2.0.3        
   [61] genetics_1.3.8.1.3     circlize_0.4.15        mvtnorm_1.1-3         
   [64] matrixStats_0.63.0     patchwork_1.1.2        mime_0.12             
   [67] evaluate_0.20          IRanges_2.32.0         gridExtra_2.3         
   [70] compiler_4.2.2         tibble_3.2.1           mice_3.15.0           
   [73] KernSmooth_2.23-20     V8_4.3.0               crayon_1.5.2          
   [76] gdistance_1.6.2        R.oo_1.25.0            htmltools_0.5.5       
   [79] mgcv_1.8-42            later_1.3.0            tidyr_1.3.0           
   [82] DBI_1.1.3              PopGenReport_3.0.7     boot_1.3-28.1         
   [85] Matrix_1.5-4           cli_3.6.1              R.methodsS3_1.8.2     
   [88] gdata_2.18.0.1         parallel_4.2.2         dotCall64_1.0-2       
   [91] igraph_1.4.2           GenomicRanges_1.50.2   pkgconfig_2.0.3       
   [94] foreign_0.8-84         terra_1.7-29           foreach_1.5.2         
   [97] bslib_0.4.2            XVector_0.38.0         digest_0.6.31         
  [100] polysat_1.7-7          Biostrings_2.66.0      operator.tools_1.6.3  
  [103] curl_5.0.0             gap_1.5-1              shiny_1.7.4           
  [106] gtools_3.9.4           rjson_0.2.21           lifecycle_1.0.3       
  [109] nlme_3.1-162           dismo_1.3-9            jsonlite_1.8.4        
  [112] seqinr_4.2-30          viridisLite_0.4.2      fansi_1.0.4           
  [115] pillar_1.9.0           GGally_2.1.2           fastmap_1.1.1         
  [118] glue_1.6.2             mmod_1.3.3             png_0.1-8             
  [121] iterators_1.0.14       stringi_1.7.12         sass_0.4.5            
  [124] caTools_1.18.2

Quality control and SNP isolation

## load meta-data file

samps <- fread("samps_10Nov2020.csv")

## open GDS for common SNPs (PoolSNP)

genofile <- seqOpen("dest.all.PoolSNP.001.50.10Nov2020.ann.gds", allow.duplicate = TRUE)

## common SNP.dt

snp.dt <- data.table(chr = seqGetData(genofile, "chromosome"),
                     pos = seqGetData(genofile, "position"),
                     nAlleles = seqGetData(genofile, "$num_allele"),
                     id = seqGetData(genofile, "variant.id"),
                     genotype = seqGetData(genofile, "allele"))

snp.dt <- snp.dt[nAlleles == 2]
seqSetFilter(genofile, snp.dt$id)

## # of selected variants: 4,281,164

## filter to target

snp.tmp <- data.table(chr ="2L", pos = 3622074:3656953)
setkey(snp.tmp, chr, pos)
setkey(snp.dt, chr, pos)
seqSetFilter(genofile, variant.id=snp.dt[J(snp.tmp), nomatch = 0]$id)

## # of selected variants: 1,613

## get annotations

message("Annotations")
tmp <- seqGetData(genofile, "annotation/info/ANN")
len1 <- tmp$length
len2 <- tmp$data

snp.dt1 <- data.table(len = rep(len1, times = len1), ann = len2, id = rep(snp.dt[J(snp.tmp), nomatch = 0]$id, times = len1))

## Extract data between the 2nd and third | symbol

snp.dt1[ ,class := tstrsplit(snp.dt1$ann,"\\|")[[2]]]
snp.dt1[ ,gene := tstrsplit(snp.dt1$ann,"\\|")[[4]]]

## Collapse additional annotations to original SNP vector length

snp.dt1.an <- snp.dt1[,list(n = length(class), col = paste(class, collapse = ","), gene = paste(gene, collapse = ",")), list(variant.id = id)]

snp.dt1.an[,col := tstrsplit(snp.dt1.an$col,"\\,")[[1]]]
snp.dt1.an[,gene := tstrsplit(snp.dt1.an$gene,"\\,")[[1]]]

## get frequencies

message("Allele Freqs")

ad <- seqGetData(genofile, "annotation/format/AD")
dp <- seqGetData(genofile, "annotation/format/DP")

af <- data.table(ad = expand.grid(ad$data)[,1],
                 dp = expand.grid(dp)[,1],
                 sampleId = rep(seqGetData(genofile, "sample.id"), dim(ad$data)[2]),
                 variant.id = rep(seqGetData(genofile, "variant.id"), each = dim(ad$data)[1]))

## merge them together

message("merge")
afi <- merge(af, snp.dt1.an, by = "variant.id")
afi <- merge(afi, snp.dt, by.x = "variant.id", by.y = "id")

afi[ , af := ad/dp]

## calculate effective read-depth

afis <- merge(afi, samps, by = "sampleId")

afis[chr == "X", nEff := round((dp*nFlies - 1)/(dp+nFlies))]
afis[chr != "X", nEff := round((dp*2*nFlies - 1)/(dp+2*nFlies))]
afis[ ,af_nEff := round(af*nEff)/nEff]


## subsetting dataset

names(afis)

##   [1] "sampleId"       "variant.id"     "ad"             "dp"            
##   [5] "n"              "col"            "gene"           "chr"           
##   [9] "pos"            "nAlleles"       "genotype"       "af"            
##  [13] "locality"       "country"        "city"           "collectionDate"
##  [17] "DateExact"      "lat"            "long"           "season"        
##  [21] "type"           "continent"      "set"            "nFlies"        
##  [25] "SRA_accession"  "SRA_experiment" "Model"          "collector"     
##  [29] "sampleType"     "year"           "yday"           "stationId"     
##  [33] "dist_km"        "In(2L)t"        "In(2R)Ns"       "In(3L)P"       
##  [37] "In(3R)C"        "In(3R)K"        "In(3R)Mo"       "In(3R)Payne"   
##  [41] "status"         "bio1"           "bio2"           "bio3"          
##  [45] "bio4"           "bio5"           "bio6"           "bio7"          
##  [49] "bio8"           "bio9"           "bio10"          "bio11"         
##  [53] "bio12"          "bio13"          "bio14"          "bio15"         
##  [57] "bio16"          "bio17"          "bio18"          "bio19"         
##  [61] "tmin1"          "tmin2"          "tmin3"          "tmin4"         
##  [65] "tmin5"          "tmin6"          "tmin7"          "tmin8"         
##  [69] "tmin9"          "tmin10"         "tmin11"         "tmin12"        
##  [73] "tmax1"          "tmax2"          "tmax3"          "tmax4"         
##  [77] "tmax5"          "tmax6"          "tmax7"          "tmax8"         
##  [81] "tmax9"          "tmax10"         "tmax11"         "tmax12"        
##  [85] "prec1"          "prec2"          "prec3"          "prec4"         
##  [89] "prec5"          "prec6"          "prec7"          "prec8"         
##  [93] "prec9"          "prec10"         "prec11"         "prec12"        
##  [97] "propSimNorm"    "sex"            "nEff"           "af_nEff"

season.dat <- as.data.frame(afis[ , c(21, 30, 1, 13, 14, 15, 19, 18, 20, 2, 11, 12, 6, 22, 99)])
str(season.dat)

## 'data.frame':    438736 obs. of  15 variables:
##  $ type      : chr  "pooled" "pooled" "pooled" "pooled" ...
##  $ year      : int  2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
##  $ sampleId  : chr  "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" "AT_Mau_14_01" ...
##  $ locality  : chr  "AT_Mau" "AT_Mau" "AT_Mau" "AT_Mau" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ city      : chr  "Mauternbach" "Mauternbach" "Mauternbach" "Mauternbach" ...
##  $ long      : num  15.6 15.6 15.6 15.6 15.6 ...
##  $ lat       : num  48.4 48.4 48.4 48.4 48.4 ...
##  $ season    : chr  "spring" "spring" "spring" "spring" ...
##  $ variant.id: int  162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
##  $ genotype  : chr  "A,C" "A,C" "C,T" "T,C" ...
##  $ af        : num  1 0 0.1569 0.0196 NA ...
##  $ col       : chr  "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ nEff      : num  28 26 31 31 NA 36 36 33 24 30 ...

head(season.dat)

##     type year     sampleId locality country        city  long    lat season
## 1 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 2 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 3 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 4 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 5 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
## 6 pooled 2014 AT_Mau_14_01   AT_Mau Austria Mauternbach 15.56 48.375 spring
##   variant.id genotype         af                 col continent nEff
## 1     162333      A,C 1.00000000 3_prime_UTR_variant    Europe   28
## 2     162334      A,C 0.00000000 3_prime_UTR_variant    Europe   26
## 3     162335      C,T 0.15686275 3_prime_UTR_variant    Europe   31
## 4     162336      T,C 0.01960784 3_prime_UTR_variant    Europe   31
## 5     162337      C,T         NA 3_prime_UTR_variant    Europe   NA
## 6     162338      C,T 0.00000000 3_prime_UTR_variant    Europe   36

dim(season.dat)

## [1] 438736     15

season.dat$season[season.dat$season == "frost"] <- "fall"

season.dat$season <- as.factor(season.dat$season)
season.filter.dat <- data.frame()

localities <- levels(as.factor(season.dat$locality))
seasons <- c("fall", "spring")

## getting samples collected at least once during spring and winter

for(loc in localities){
    dft <- season.dat[season.dat$locality == loc, ]
    if(all(seasons %in% unique(dft$season))){
        season.filter.dat <- rbind(season.filter.dat, dft)
    }
}

dim(season.filter.dat)

## [1] 285501     15

dim(na.omit(season.filter.dat))

## [1] 269454     15

str(season.filter.dat)

## 'data.frame':    285501 obs. of  15 variables:
##  $ type      : chr  "pooled" "pooled" "pooled" "pooled" ...
##  $ year      : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ locality  : chr  "AT_gr" "AT_gr" "AT_gr" "AT_gr" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ city      : chr  "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
##  $ long      : num  16.4 16.4 16.4 16.4 16.4 ...
##  $ lat       : num  48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 48.2 ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
##  $ genotype  : chr  "A,C" "A,C" "C,T" "T,C" ...
##  $ af        : num  0.9773 NA 0.2273 0 0.0244 ...
##  $ col       : chr  "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" "3_prime_UTR_variant" ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ nEff      : num  29 NA 29 33 28 28 27 25 27 25 ...

mat1 <- season.filter.dat[ , c(3, 6, 5, 9, 10, 14, 12, 15)]
names(mat1)

## [1] "sampleId"   "city"       "country"    "season"     "variant.id"
## [6] "continent"  "af"         "nEff"

str(mat1)

## 'data.frame':    285501 obs. of  8 variables:
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ city      : chr  "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162334 162335 162336 162337 162338 162339 162340 162341 162342 ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ af        : num  0.9773 NA 0.2273 0 0.0244 ...
##  $ nEff      : num  29 NA 29 33 28 28 27 25 27 25 ...

dim(mat1)

## [1] 285501      8

mat1 <- mat1[!is.na(mat1$nEff), ]
dim(mat1)

## [1] 269454      8

mat1 <- mat1[mat1$nEff >= 28, ] ## Applying a Neff filter of 28
dim(mat1)

## [1] 212017      8

mat1 <- mat1[ , -8]
str(mat1)

## 'data.frame':    212017 obs. of  7 variables:
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ city      : chr  "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" "Gross-Enzersdorf" ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ af        : num  0.9773 0.2273 0 0.0244 0 ...

## Changing name of cities

mat1$city <- as.factor(mat1$city)
levels(mat1$city)

##  [1] "Akaa"                        "Athens"                     
##  [3] "Barcelona"                   "Benton Harbor"              
##  [5] "Broggingen"                  "Brzezina"                   
##  [7] "Chalet \xe0 Gobet"           "Charlottesville"            
##  [9] "Charlotttesville"            "Chornobyl (Cooling pond)"   
## [11] "Chornobyl Applegarden"       "Chornobyl Applegarden (New)"
## [13] "Chornobyl Kopachi"           "Chornobyl Polisske"         
## [15] "Chornobyl Yaniv"             "Cross Plains"               
## [17] "Drogobych"                   "Esparto"                    
## [19] "Gimenells"                   "Gotheron"                   
## [21] "Gross-Enzersdorf"            "Homestead"                  
## [23] "Ithaca"                      "Kalna"                      
## [25] "Karensminde"                 "Karensminde Orchard"        
## [27] "Kharkiv"                     "Kyiv"                       
## [29] "Kyiv (Vyshneve)"             "Lancaster"                  
## [31] "Linvilla"                    "Market Harborough"          
## [33] "Mauternbach"                 "Munich"                     
## [35] "Odesa"                       "Odessa"                     
## [37] "Sheffield"                   "Slankamen. Vinogradi"       
## [39] "State College"               "Sudbury"                    
## [41] "Topeka"                      "Tuolumne"                   
## [43] "valday"                      "Valday"                     
## [45] "Viltain"                     "Yesiloz"                    
## [47] "Yesiloz TR13"                "Yesiloz TR14"               
## [49] "Yesiloz TR15"                "Yesiloz TR16"               
## [51] "Yesiloz TR17"                "Yesiloz TR18"

levels(mat1$city) <- gsub("valday", "Valday", levels(mat1$city))
levels(mat1$city) <- gsub("Odesa", "Odessa", levels(mat1$city))
levels(mat1$city) <- gsub("Charlotttesville", "Charlottesville", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR13", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR14", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR15", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR16", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR17", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Yesiloz TR18", "Yesiloz", levels(mat1$city))
levels(mat1$city) <- gsub("Karensminde Orchard", "Karensminde", levels(mat1$city))
levels(mat1$city) <- gsub("Slankamen. Vinogradi", "Slankamen-Vinogradi", levels(mat1$city))
levels(mat1$city) <- gsub("Chalet \xe0 Gobet", "Chalet-A-Gobet", levels(mat1$city))

levels(mat1$city)

##  [1] "Akaa"                        "Athens"                     
##  [3] "Barcelona"                   "Benton Harbor"              
##  [5] "Broggingen"                  "Brzezina"                   
##  [7] "Chalet-A-Gobet"              "Charlottesville"            
##  [9] "Chornobyl (Cooling pond)"    "Chornobyl Applegarden"      
## [11] "Chornobyl Applegarden (New)" "Chornobyl Kopachi"          
## [13] "Chornobyl Polisske"          "Chornobyl Yaniv"            
## [15] "Cross Plains"                "Drogobych"                  
## [17] "Esparto"                     "Gimenells"                  
## [19] "Gotheron"                    "Gross-Enzersdorf"           
## [21] "Homestead"                   "Ithaca"                     
## [23] "Kalna"                       "Karensminde"                
## [25] "Kharkiv"                     "Kyiv"                       
## [27] "Kyiv (Vyshneve)"             "Lancaster"                  
## [29] "Linvilla"                    "Market Harborough"          
## [31] "Mauternbach"                 "Munich"                     
## [33] "Odessa"                      "Sheffield"                  
## [35] "Slankamen-Vinogradi"         "State College"              
## [37] "Sudbury"                     "Topeka"                     
## [39] "Tuolumne"                    "Valday"                     
## [41] "Viltain"                     "Yesiloz"

levels(mat1$city)[c(9, 10, 11, 12, 13, 14)] <- "Chornobyl"
levels(mat1$city)

##  [1] "Akaa"                "Athens"              "Barcelona"          
##  [4] "Benton Harbor"       "Broggingen"          "Brzezina"           
##  [7] "Chalet-A-Gobet"      "Charlottesville"     "Chornobyl"          
## [10] "Cross Plains"        "Drogobych"           "Esparto"            
## [13] "Gimenells"           "Gotheron"            "Gross-Enzersdorf"   
## [16] "Homestead"           "Ithaca"              "Kalna"              
## [19] "Karensminde"         "Kharkiv"             "Kyiv"               
## [22] "Kyiv (Vyshneve)"     "Lancaster"           "Linvilla"           
## [25] "Market Harborough"   "Mauternbach"         "Munich"             
## [28] "Odessa"              "Sheffield"           "Slankamen-Vinogradi"
## [31] "State College"       "Sudbury"             "Topeka"             
## [34] "Tuolumne"            "Valday"              "Viltain"            
## [37] "Yesiloz"

levels(mat1$city)[22] <- "Kyiv"
levels(mat1$city)

##  [1] "Akaa"                "Athens"              "Barcelona"          
##  [4] "Benton Harbor"       "Broggingen"          "Brzezina"           
##  [7] "Chalet-A-Gobet"      "Charlottesville"     "Chornobyl"          
## [10] "Cross Plains"        "Drogobych"           "Esparto"            
## [13] "Gimenells"           "Gotheron"            "Gross-Enzersdorf"   
## [16] "Homestead"           "Ithaca"              "Kalna"              
## [19] "Karensminde"         "Kharkiv"             "Kyiv"               
## [22] "Lancaster"           "Linvilla"            "Market Harborough"  
## [25] "Mauternbach"         "Munich"              "Odessa"             
## [28] "Sheffield"           "Slankamen-Vinogradi" "State College"      
## [31] "Sudbury"             "Topeka"              "Tuolumne"           
## [34] "Valday"              "Viltain"             "Yesiloz"

levels(mat1$city)[29] <- "Slankamenacki-Vinogradi"
levels(mat1$city)

##  [1] "Akaa"                    "Athens"                 
##  [3] "Barcelona"               "Benton Harbor"          
##  [5] "Broggingen"              "Brzezina"               
##  [7] "Chalet-A-Gobet"          "Charlottesville"        
##  [9] "Chornobyl"               "Cross Plains"           
## [11] "Drogobych"               "Esparto"                
## [13] "Gimenells"               "Gotheron"               
## [15] "Gross-Enzersdorf"        "Homestead"              
## [17] "Ithaca"                  "Kalna"                  
## [19] "Karensminde"             "Kharkiv"                
## [21] "Kyiv"                    "Lancaster"              
## [23] "Linvilla"                "Market Harborough"      
## [25] "Mauternbach"             "Munich"                 
## [27] "Odessa"                  "Sheffield"              
## [29] "Slankamenacki-Vinogradi" "State College"          
## [31] "Sudbury"                 "Topeka"                 
## [33] "Tuolumne"                "Valday"                 
## [35] "Viltain"                 "Yesiloz"

dim(mat1)

## [1] 212017      7

str(mat1)

## 'data.frame':    212017 obs. of  7 variables:
##  $ sampleId  : chr  "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" "AT_gr_12_fall" ...
##  $ city      : Factor w/ 36 levels "Akaa","Athens",..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ country   : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ season    : Factor w/ 2 levels "fall","spring": 1 1 1 1 1 1 1 1 1 1 ...
##  $ variant.id: int  162333 162335 162336 162337 162338 162350 162351 162352 162353 162357 ...
##  $ continent : chr  "Europe" "Europe" "Europe" "Europe" ...
##  $ af        : num  0.9773 0.2273 0 0.0244 0 ...

mat1 <- mat1[!mat1$country == "United Kingdom", ] ## The united kingdom only has 4 pooled samples, so I removed it from the analyses

mat2 <- reshape(mat1, idvar = c("sampleId", "city", "country", "season", "continent"), timevar = "variant.id", direction = "wide")
mat2[1:6, 1:10]

##              sampleId             city country season continent af.162333
## 9679    AT_gr_12_fall Gross-Enzersdorf Austria   fall    Europe 0.9772727
## 11292 AT_gr_12_spring Gross-Enzersdorf Austria spring    Europe 0.8888889
## 1        AT_Mau_14_01      Mauternbach Austria spring    Europe 1.0000000
## 1617     AT_Mau_14_02      Mauternbach Austria   fall    Europe        NA
## 3227     AT_Mau_15_50      Mauternbach Austria spring    Europe 0.7714286
## 4840     AT_Mau_15_51      Mauternbach Austria   fall    Europe 0.6984127
##       af.162335  af.162336  af.162337 af.162338
## 9679  0.2272727 0.00000000 0.02439024         0
## 11292        NA 0.00000000         NA        NA
## 1     0.1568627 0.01960784         NA         0
## 1617         NA 0.00000000 0.00000000         0
## 3227  0.1323529 0.00000000 0.00000000         0
## 4840  0.2656250 0.01282051 0.00000000         0

dim(mat2)

## [1]  170 1614

mat2$city <- as.character(mat2$city)
mat2$season <- as.character(mat2$season)

mat2 <- mat2[mat2$city != "Homestead", ]
mat2 <- mat2[mat2$city != "Athens", ]
mat2 <- mat2[mat2$city != "Sudbury", ]
mat2 <- mat2[mat2$city != "Brzezina", ]
mat2 <- mat2[mat2$city != "Kalna", ]
mat2 <- mat2[mat2$city != "Slankamenacki-Vinogradi", ]
mat2 <- mat2[mat2$city != "Topeka", ]
mat2 <- mat2[mat2$city != "Chalet-A-Gobet", ]

usa <- c("Lancaster", "Ithaca", "Cross Plains", "Benton Harbor", "Linvilla", "State College", "Tuolumne", "Esparto", "Charlottesville")

for(i in usa){
    mat2$country[mat2$city == i] <- i
}

mat2$country <- as.factor(mat2$country)

levels(mat2$country)

##  [1] "Austria"         "Benton Harbor"   "Charlottesville" "Cross Plains"   
##  [5] "Denmark"         "Esparto"         "Finland"         "France"         
##  [9] "Germany"         "Ithaca"          "Lancaster"       "Linvilla"       
## [13] "Russia"          "Spain"           "State College"   "Tuolumne"       
## [17] "Turkey"          "Ukraine"

levels(mat2$country)[c(2, 4)] <- "MI-WI"
levels(mat2$country)

##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "Ithaca"          "Lancaster"       "Linvilla"        "Russia"         
## [13] "Spain"           "State College"   "Tuolumne"        "Turkey"         
## [17] "Ukraine"

levels(mat2$country)[c(9, 10)] <- "NY-MA"
levels(mat2$country)

##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "NY-MA"           "Linvilla"        "Russia"          "Spain"          
## [13] "State College"   "Tuolumne"        "Turkey"          "Ukraine"

levels(mat2$country)[c(10, 13)] <- "PA"
levels(mat2$country)

##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "NY-MA"           "PA"              "Russia"          "Spain"          
## [13] "Tuolumne"        "Turkey"          "Ukraine"

mat3 <- as.data.frame(getGenotypeAlleles(genofile)) ## extracting genotypes from the GDS file
mat3[1:5, 1:5]

##              162333 162334 162335 162336 162337
## AT_Mau_14_01    C/C    A/A    C/T    T/C   <NA>
## AT_Mau_14_02    A/C    A/A    C/T    T/T    C/C
## AT_Mau_15_50    A/C    A/A    C/T    T/T    C/C
## AT_Mau_15_51    A/C   <NA>    C/T    T/C    C/C
## AT_See_14_44    A/C    A/C    C/T    T/T    C/C

for(i in 1:ncol(mat3)){
    mat3[ , i] <- gsub("/", "", mat3[, i])
}

mat3[1:5, 1:5]

##              162333 162334 162335 162336 162337
## AT_Mau_14_01     CC     AA     CT     TC   <NA>
## AT_Mau_14_02     AC     AA     CT     TT     CC
## AT_Mau_15_50     AC     AA     CT     TT     CC
## AT_Mau_15_51     AC   <NA>     CT     TC     CC
## AT_See_14_44     AC     AC     CT     TT     CC

mat3$ind <- rownames(mat3)
dim(mat3)

## [1]  272 1614

mat3[1:5, 1610:1614]

##              163994 163995 163996 163997          ind
## AT_Mau_14_01     GA     GG     GC     AA AT_Mau_14_01
## AT_Mau_14_02     GA     GG     GC     AT AT_Mau_14_02
## AT_Mau_15_50     GA     GG     GC     AA AT_Mau_15_50
## AT_Mau_15_51     GA     GG     GC     AA AT_Mau_15_51
## AT_See_14_44     GA     GA     GC     AA AT_See_14_44

mat4 <- mat3[mat3$ind %in% mat2$sampleId, ]
dim(mat4)

## [1]  152 1614

mat4[1:5, 1600:1614]

##               163983 163984 163985 163986 163987 163988 163989 163991 163992
## AT_Mau_14_01      AA     AG     TT     CG     AG     CC     AC     CC     AG
## AT_Mau_14_02      AA     AG     TT     CG     AG     CC     AC     CC     AA
## AT_Mau_15_50      AA     AG     TC     CG     AG     CT     AC     CG     AG
## AT_Mau_15_51      AA     AG     TT     CG     AG     CC     AC     CG     AA
## AT_gr_12_fall     AA     AG     TT     CG     AG     CT     AC     CG     AA
##               163993 163994 163995 163996 163997           ind
## AT_Mau_14_01      TC     GA     GG     GC     AA  AT_Mau_14_01
## AT_Mau_14_02      TC     GA     GG     GC     AT  AT_Mau_14_02
## AT_Mau_15_50      TC     GA     GG     GC     AA  AT_Mau_15_50
## AT_Mau_15_51      TC     GA     GG     GC     AA  AT_Mau_15_51
## AT_gr_12_fall     TC     GA     GG     GC     AA AT_gr_12_fall

same_ord <- mat2[ , c(1, 2, 3, 4)]
colnames(same_ord) <- c("ind", "city", "country", "season")
mat5 <- merge(mat4, same_ord, by = "ind")
dim(mat5)

## [1]  152 1617

mat5[1:5, 1:5]

##               ind 162333 162334 162335 162336
## 1   AT_gr_12_fall     AC   <NA>     CT     TT
## 2 AT_gr_12_spring     AC     AA     CT     TT
## 3    AT_Mau_14_01     CC     AA     CT     TC
## 4    AT_Mau_14_02     AC     AA     CT     TT
## 5    AT_Mau_15_50     AC     AA     CT     TT

rownames(mat5) <- mat5[ , 1]
dim(mat5)

## [1]  152 1617

mat5[1:5, 1610:1617]

##                 163993 163994 163995 163996 163997             city country
## AT_gr_12_fall       TC     GA     GG     GC     AA Gross-Enzersdorf Austria
## AT_gr_12_spring     TC     GA     GG     GC     AA Gross-Enzersdorf Austria
## AT_Mau_14_01        TC     GA     GG     GC     AA      Mauternbach Austria
## AT_Mau_14_02        TC     GA     GG     GC     AT      Mauternbach Austria
## AT_Mau_15_50        TC     GA     GG     GC     AA      Mauternbach Austria
##                 season
## AT_gr_12_fall     fall
## AT_gr_12_spring spring
## AT_Mau_14_01    spring
## AT_Mau_14_02      fall
## AT_Mau_15_50    spring

mat6 <- mat5[ , -c(1, 1615, 1616, 1617)]
dim(mat6)

## [1]  152 1613

mat6[1:5, 1:5]

##                 162333 162334 162335 162336 162337
## AT_gr_12_fall       AC   <NA>     CT     TT     CT
## AT_gr_12_spring     AC     AA     CT     TT     CC
## AT_Mau_14_01        CC     AA     CT     TC   <NA>
## AT_Mau_14_02        AC     AA     CT     TT     CC
## AT_Mau_15_50        AC     AA     CT     TT     CC

mat6[1:5, 1605:1613]

##                 163988 163989 163991 163992 163993 163994 163995 163996 163997
## AT_gr_12_fall       CT     AC     CG     AA     TC     GA     GG     GC     AA
## AT_gr_12_spring     CC     AA     CG     AA     TC     GA     GG     GC     AA
## AT_Mau_14_01        CC     AC     CC     AG     TC     GA     GG     GC     AA
## AT_Mau_14_02        CC     AC     CC     AA     TC     GA     GG     GC     AT
## AT_Mau_15_50        CT     AC     CG     AG     TC     GA     GG     GC     AA

fly_gen <- df2genind(mat6, ploidy = 2, ind.names = rownames(mat6), pop = mat5$country, sep = "")
fly_gen

## /// GENIND OBJECT /////////
## 
##  // 152 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
## 
##  // Basic content
##    @tab:  152 x 3225 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 1-2)
##    @loc.fac: locus factor for the 3225 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: df2genind(X = mat6, sep = "", ind.names = rownames(mat6), pop = mat5$country, 
##     ploidy = 2)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)

fly_gen$tab[1:5, 1:5]

##                 162333.A 162333.C 162334.A 162334.C 162335.C
## AT_gr_12_fall          1        1       NA       NA        1
## AT_gr_12_spring        1        1        2        0        1
## AT_Mau_14_01           0        2        2        0        1
## AT_Mau_14_02           1        1        2        0        1
## AT_Mau_15_50           1        1        2        0        1

summary(fly_gen$pop)

##         Austria           MI-WI Charlottesville         Denmark         Esparto 
##               6               9               6               5               5 
##         Finland          France         Germany           NY-MA              PA 
##               6              10              12               6              18 
##          Russia           Spain        Tuolumne          Turkey         Ukraine 
##               6               7               5              13              38

indmiss_fly <- propTyped(fly_gen, by = "ind")
indmiss_fly[which(indmiss_fly < 0.80)]

## PA_li_10_spring 
##       0.7123373

barplot(indmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)

fly_gen <- missingno(fly_gen, type = "geno", cutoff = 0.20) ## This is the genind file that I will use for all the analyses
fly_gen

## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,613 loci; 3,225 alleles; size: 2.7 Mb
## 
##  // Basic content
##    @tab:  151 x 3225 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 1-2)
##    @loc.fac: locus factor for the 3225 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)

## quality control begins here

mlg(fly_gen) ## keep only polymorphic loci

## #############################
## # Number of Individuals:  151 
## # Number of MLG:  151 
## #############################

## [1] 151

isPoly(fly_gen) %>% summary

##    Mode   FALSE    TRUE 
## logical       1    1612

poly_loci <- names(which(isPoly(fly_gen) == TRUE))
fly_gen2 <- fly_gen[loc = poly_loci]
isPoly(fly_gen2) %>% summary

##    Mode    TRUE 
## logical    1612

fly_gen2$loc.n.all <- fly_gen2$loc.n.all[which(fly_gen2$loc.n.all == 2)]

n <- names(which(nAll(fly_gen2) == 2))
fly_gen2 <- fly_gen2[loc = n]
fly_gen2

## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,612 loci; 3,224 alleles; size: 2.7 Mb
## 
##  // Basic content
##    @tab:  151 x 3224 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3224 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)

indmiss_fly2 <- propTyped(fly_gen2, by = "ind")

barplot(indmiss_fly2, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Genotypes", side = 1, line = 3.7)

locmiss_fly <- propTyped(fly_gen2, by = "loc")
locmiss_fly[which(locmiss_fly < 0.80)] ## print loci with < 80% complete genotypes

##    162334    162396    162412    162545    162546    162547    162549    162551 
## 0.6225166 0.6092715 0.7880795 0.7947020 0.7682119 0.7947020 0.7748344 0.7880795 
##    162553    162554    162555    162557    162560    162563    162587    162623 
## 0.7880795 0.7814570 0.7880795 0.7947020 0.7814570 0.7947020 0.7748344 0.5298013 
##    162746    162755    162756    162882    162892    162986    162987    162988 
## 0.4238411 0.5562914 0.5562914 0.7748344 0.4238411 0.7152318 0.7152318 0.7284768 
##    162989    162990    162992    163029    163072    163073    163074    163075 
## 0.7284768 0.7284768 0.7549669 0.6291391 0.4834437 0.4370861 0.4834437 0.4834437 
##    163076    163118    163119    163120    163121    163122    163123    163124 
## 0.4834437 0.6688742 0.6688742 0.6688742 0.6688742 0.6688742 0.6158940 0.6158940 
##    163125    163307    163363    163365    163366    163608    163648    163650 
## 0.4370861 0.4768212 0.7947020 0.7947020 0.7483444 0.7350993 0.5165563 0.5827815 
##    163692    163715    163716    163717    163765    163821    163908    163910 
## 0.4569536 0.7086093 0.7086093 0.7086093 0.6357616 0.6092715 0.5231788 0.5629139 
##    163914    163915    163916    163917    163922    163958 
## 0.6158940 0.5894040 0.5827815 0.5761589 0.6953642 0.7019868

barplot(locmiss_fly, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)

fly_gen3 <- missingno(fly_gen2, type = "loci", cutoff = 0.20)
fly_gen3

## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)

locmiss_fly3 <- propTyped(fly_gen3, by = "loc")
locmiss_fly3[which(locmiss_fly3 < 0.80)]

## named numeric(0)

fly_gen3$tab[1:5, 1:5]

##                 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall          1        1        1        1        2
## AT_gr_12_spring        1        1        1        1        2
## AT_Mau_14_01           0        2        1        1        1
## AT_Mau_14_02           1        1        1        1        2
## AT_Mau_15_50           1        1        1        1        2

min(ploidy(fly_gen3))

## [1] 2

max(ploidy(fly_gen3))

## [1] 2

summary(fly_gen3$pop)

##         Austria           MI-WI Charlottesville         Denmark         Esparto 
##               6               9               6               5               5 
##         Finland          France         Germany           NY-MA              PA 
##               6              10              12               6              17 
##          Russia           Spain        Tuolumne          Turkey         Ukraine 
##               6               7               5              13              38

mlg(fly_gen3)

## #############################
## # Number of Individuals:  151 
## # Number of MLG:  151 
## #############################

## [1] 151

isPoly(fly_gen3) %>% summary

##    Mode    TRUE 
## logical    1550

poly_loci3 <- names(which(isPoly(fly_gen3) == TRUE))
fly_gen3 <- fly_gen3[loc = poly_loci3]
isPoly(fly_gen3) %>% summary

##    Mode    TRUE 
## logical    1550

barplot(locmiss_fly3, ylim = c(0, 1), ylab = "Complete genotypes (proportion)", xlab = "", las = 2, cex.names = 0.4)
mtext("Locus", side = 1, line = 3)

## computing basic stats

basic_fly <- basic.stats(fly_gen3, diploid = TRUE)

## testing for H-W equilibrium

## Chi-squared test: p-value

HWE.test <- data.frame(sapply(seppop(fly_gen3), 
                              function(ls) pegas::hw.test(ls, B = 0)[ , 3]))


HWE.test.chisq <- t(data.matrix(HWE.test))
HWE.test.chisq[1:5, 1:5]

##                     162333     162335    162336    162337    162338
## Austria         0.08018122 0.01430588 0.6242061 0.8037847 1.0000000
## MI.WI           0.08968602 0.04722090 1.0000000 1.0000000 0.5485062
## Charlottesville 0.23013934 0.02534732 1.0000000 0.8237839 0.8037847
## Denmark         0.33790402 0.02534732 1.0000000 0.5761501 1.0000000
## Esparto         0.23013934 0.04550026 1.0000000 0.7750970 0.8037847

## Monte Carlo: p-value

HWE.test <- data.frame(sapply(seppop(fly_gen3), 
                              function(ls) pegas::hw.test(ls, B = 1000)[ , 4]))

HWE.test.MC <- t(data.matrix(HWE.test))
HWE.test.MC[1:5, 1:5]

##                 162333 162335 162336 162337 162338
## Austria          0.359  0.083      1      1      1
## MI.WI            0.411  0.165      1      1      1
## Charlottesville  1.000  0.115      1      1      1
## Denmark          1.000  0.123      1      1      1
## Esparto          1.000  0.299      1      1      1

alpha <- 0.05

Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq < alpha, 2, mean), MC = apply(HWE.test.MC < alpha, 2, mean))

Prop.loci.out.of.HWE[1:10, ]

##            Chisq        MC
## 162333 0.3333333 0.2666667
## 162335 1.0000000 0.4000000
## 162336 0.0000000 0.0000000
## 162337 0.0000000 0.0000000
## 162338 0.0000000 0.0000000
## 162339 0.0000000 0.0000000
## 162340 0.0000000 0.0000000
## 162341 1.0000000 0.4666667
## 162342 0.0000000 0.0000000
## 162343 0.0000000 0.0000000

## "False discovery rate" correction for the number of tests. Here I use the function ‘p.adjust’ with the argument method = "fdr" to adjust the p-values from the previous tests

Chisq.fdr <- matrix(p.adjust(HWE.test.chisq, method = "fdr"), 
                    nrow = nrow(HWE.test.chisq))

MC.fdr <- matrix(p.adjust(HWE.test.MC, method = "fdr"), 
                    nrow = nrow(HWE.test.MC))

Prop.loci.out.of.HWE <- data.frame(Chisq = apply(HWE.test.chisq<alpha, 2, mean), 
           MC = apply(HWE.test.MC<alpha, 2, mean),
           Chisq.fdr = apply(Chisq.fdr<alpha, 2, mean),
           MC.fdr = apply(MC.fdr<alpha, 2, mean))

head(Prop.loci.out.of.HWE)

##            Chisq        MC Chisq.fdr    MC.fdr
## 162333 0.3333333 0.2666667 0.2666667 0.1333333
## 162335 1.0000000 0.4000000 0.6000000 0.2666667
## 162336 0.0000000 0.0000000 0.0000000 0.0000000
## 162337 0.0000000 0.0000000 0.0000000 0.0000000
## 162338 0.0000000 0.0000000 0.0000000 0.0000000
## 162339 0.0000000 0.0000000 0.0000000 0.0000000

loci <- data.frame()

idx <- 1

for(i in 1:nrow(Prop.loci.out.of.HWE)){
    fdr <- Prop.loci.out.of.HWE[i , ]
    if(fdr$MC.fdr > 0.5){
        loci <- rbind(fdr, loci)
    }

}

loci ## none of the loci are consistenly out of HWE. Some of them are out of HWE in less than 50% of the populations.

## data frame with 0 columns and 0 rows

dim(loci)

## [1] 0 0

## check for linkage disequilibrium (LD)

LD <- as.genclone(fly_gen3)
LD

## 
## This is a genclone object
## -------------------------
## Genotype information:
## 
##     151 original multilocus genotypes 
##     151 diploid individuals
##    1550 codominant loci
## 
## Population information:
## 
##       0 strata. 
##      15 populations defined - 
## Austria, MI-WI, Charlottesville, ..., Tuolumne, Turkey, Ukraine

quartz()
LD.general <- poppr::ia(LD, sample = 500)

LD.general

##           Ia         p.Ia        rbarD         p.rD 
## 15.592940300  0.001996008  0.013771123  0.001996008

Computing pairwise Fst test

fly_fst <- genet.dist(fly_gen3, method = "WC84") %>% round(digits = 3)

fst.mat <- as.matrix(fly_fst)
fst.mat[fst.mat < 0] <- 0
fst.mat[1:10, 1:10]

##                 Austria MI-WI Charlottesville Denmark Esparto Finland France
## Austria           0.000 0.053           0.058   0.018   0.056   0.015  0.021
## MI-WI             0.053 0.000           0.005   0.049   0.012   0.049  0.042
## Charlottesville   0.058 0.005           0.000   0.054   0.017   0.057  0.045
## Denmark           0.018 0.049           0.054   0.000   0.049   0.024  0.018
## Esparto           0.056 0.012           0.017   0.049   0.000   0.054  0.041
## Finland           0.015 0.049           0.057   0.024   0.054   0.000  0.022
## France            0.021 0.042           0.045   0.018   0.041   0.022  0.000
## Germany           0.012 0.041           0.046   0.016   0.043   0.013  0.010
## NY-MA             0.044 0.006           0.003   0.040   0.009   0.044  0.033
## PA                0.053 0.004           0.004   0.050   0.013   0.051  0.044
##                 Germany NY-MA    PA
## Austria           0.012 0.044 0.053
## MI-WI             0.041 0.006 0.004
## Charlottesville   0.046 0.003 0.004
## Denmark           0.016 0.040 0.050
## Esparto           0.043 0.009 0.013
## Finland           0.013 0.044 0.051
## France            0.010 0.033 0.044
## Germany           0.000 0.034 0.041
## NY-MA             0.034 0.000 0.002
## PA                0.041 0.002 0.000

tab <- gi2gl(fly_gen3, parallel = FALSE, verbose = NULL) ## converting genind object to genlight object

## Starting gi2gl 
## Starting gl.compliance.check 
##   Processing genlight object with SNP data
##   Checking coding of SNPs
##     SNP data scored NA, 0, 1 or 2 confirmed
##   Checking locus metrics and flags
##   Recalculating locus metrics
##   Checking for monomorphic loci
##     No monomorphic loci detected
##   Checking for loci with all missing data
##     No loci with all missing data detected
##   Checking whether individual names are unique.
##   Checking for individual metrics
##   Warning: Creating a slot for individual metrics
##   Checking for population assignments
##     Population assignments confirmed
##   Spelling of coordinates checked and changed if necessary to 
##             lat/lon
## Completed: gl.compliance.check 
## Completed: gi2gl

pval_fst <- gl.fst.pop(tab, nboots = 1000, percent = 95, nclusters = 1, verbose = NULL)

## Starting gl.fst.pop 
##   Processing genlight object with SNP data
## Completed: gl.fst.pop

fst <- as.matrix(as.dist(pval_fst$Fsts))
fst[fst < 0] <- 0
fst

##                    Austria     Esparto    Tuolumne     Germany    Denmark
## Austria         0.00000000 0.055703474 0.058895752 0.012410932 0.01800624
## Esparto         0.05570347 0.000000000 0.000000000 0.043366976 0.04899978
## Tuolumne        0.05889575 0.000000000 0.000000000 0.046380979 0.05326210
## Germany         0.01241093 0.043366976 0.046380979 0.000000000 0.01632085
## Denmark         0.01800624 0.048999783 0.053262097 0.016320855 0.00000000
## Spain           0.01644031 0.044356991 0.047779471 0.007609145 0.02535346
## Finland         0.01510726 0.054481851 0.060073773 0.013480562 0.02446933
## France          0.02086204 0.040896003 0.045503573 0.009973570 0.01796563
## NY-MA           0.04387058 0.008737692 0.007955468 0.033551239 0.04003517
## MI-WI           0.05336238 0.012416407 0.008962355 0.041320541 0.04880945
## PA              0.05310220 0.013471651 0.009573028 0.041313299 0.05015960
## Russia          0.01689966 0.053516381 0.060654501 0.020656074 0.02332836
## Turkey          0.01412191 0.060541318 0.062921149 0.018900400 0.02179383
## Ukraine         0.01416309 0.063774691 0.069481577 0.020327235 0.02369749
## Charlottesville 0.05819820 0.017272103 0.015222898 0.046035247 0.05402774
##                       Spain    Finland     France       NY-MA       MI-WI
## Austria         0.016440309 0.01510726 0.02086204 0.043870575 0.053362379
## Esparto         0.044356991 0.05448185 0.04089600 0.008737692 0.012416407
## Tuolumne        0.047779471 0.06007377 0.04550357 0.007955468 0.008962355
## Germany         0.007609145 0.01348056 0.00997357 0.033551239 0.041320541
## Denmark         0.025353461 0.02446933 0.01796563 0.040035167 0.048809450
## Spain           0.000000000 0.02001428 0.01567832 0.029541647 0.039313950
## Finland         0.020014275 0.00000000 0.02212788 0.043508846 0.048678641
## France          0.015678321 0.02212788 0.00000000 0.033457193 0.042244680
## NY-MA           0.029541647 0.04350885 0.03345719 0.000000000 0.005774646
## MI-WI           0.039313950 0.04867864 0.04224468 0.005774646 0.000000000
## PA              0.038958914 0.05145801 0.04367546 0.001936861 0.004351338
## Russia          0.031639501 0.01913517 0.02199934 0.047419819 0.052957069
## Turkey          0.026809159 0.01677821 0.02823027 0.049358359 0.054304445
## Ukraine         0.031831900 0.02245250 0.02881299 0.055798661 0.064183546
## Charlottesville 0.046471038 0.05728551 0.04450952 0.002951612 0.004506388
##                          PA      Russia     Turkey     Ukraine Charlottesville
## Austria         0.053102204 0.016899657 0.01412191 0.014163086     0.058198201
## Esparto         0.013471651 0.053516381 0.06054132 0.063774691     0.017272103
## Tuolumne        0.009573028 0.060654501 0.06292115 0.069481577     0.015222898
## Germany         0.041313299 0.020656074 0.01890040 0.020327235     0.046035247
## Denmark         0.050159598 0.023328359 0.02179383 0.023697489     0.054027742
## Spain           0.038958914 0.031639501 0.02680916 0.031831900     0.046471038
## Finland         0.051458012 0.019135166 0.01677821 0.022452499     0.057285510
## France          0.043675456 0.021999335 0.02823027 0.028812985     0.044509519
## NY-MA           0.001936861 0.047419819 0.04935836 0.055798661     0.002951612
## MI-WI           0.004351338 0.052957069 0.05430445 0.064183546     0.004506388
## PA              0.000000000 0.058255707 0.05610334 0.066041832     0.004019586
## Russia          0.058255707 0.000000000 0.01717863 0.007515034     0.059645911
## Turkey          0.056103340 0.017178630 0.00000000 0.012939846     0.058036081
## Ukraine         0.066041832 0.007515034 0.01293985 0.000000000     0.067300473
## Charlottesville 0.004019586 0.059645911 0.05803608 0.067300473     0.000000000

## png("heatmap.png", width = 7, height = 7, units = "in", res = 300)

## pdf("heatmap.pdf")

my_palette <- colorRampPalette(c("blue", "red"))(n = 100)

heatmap.2(fst.mat, density.info = "none", trace = "none", scale = "none", cexRow = 0.7, cexCol = 0.7, key.title = "Fst", srtCol = 45, srtRow = 35, margins = c(8.5, 5), col = my_palette, symbreaks = TRUE)

## dev.off()

Figure 1. Heatmap depicting genetic differentiation among populations of D. melanogaster based on pairwise Fst values. In some cases, samples from adjacent localities were combined to avoid low sample sizes. Abbreviations are as follows: PA = Pennsylvania; MI-WI = Michigan and Wisconsin; NY-MA = New York and Massachusetts.

Genetic admixture proportions

##seqSetFilter(genofile, variant.id = rownames(snps), sample.id = rownames(indi))
##seqGDS2VCF(genofile, "my_vcf.gz", verbose = TRUE)
##outest <- read.vcfR("my_vcf.gz")
##dataout <- vcfR2genind(outest)


## exploring the data once again (do it all the time!)

fly_gen3

## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)

fly_gen3$tab[1:5, 1:5]

##                 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall          1        1        1        1        2
## AT_gr_12_spring        1        1        1        1        2
## AT_Mau_14_01           0        2        1        1        1
## AT_Mau_14_02           1        1        1        1        2
## AT_Mau_15_50           1        1        1        1        2

nLoc(fly_gen3) # number of loci

## [1] 1550

nPop(fly_gen3) # number of sites

## [1] 15

nInd(fly_gen3) # number of individuals

## [1] 151

summary(fly_gen3$pop) # sample size

##         Austria           MI-WI Charlottesville         Denmark         Esparto 
##               6               9               6               5               5 
##         Finland          France         Germany           NY-MA              PA 
##               6              10              12               6              17 
##          Russia           Spain        Tuolumne          Turkey         Ukraine 
##               6               7               5              13              38

min(ploidy(fly_gen3))

## [1] 2

max(ploidy(fly_gen3))

## [1] 2

## gl2geno(tab, outfile = "gl_geno", outpath = getwd(), verbose = NULL) ## Converting genlight object to geno object

## Run snmf algorithm

#snmf1 <- snmf("gl_geno.geno",
#             K = 1:10, # number of K ancestral populations to run
#             repetitions = 50, # 50 repetitions for each K
#             entropy = TRUE, # calculate cross-entropy
#             project = "new")


snmf1 <- load.snmfProject("gl_geno.snmfProject")


## extract Q-matrix for the best run

plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1)

## extract the cross-entropy of all runs where K = 9

ce <- cross.entropy(snmf1, K = 9)
ce

##            K = 9
## run 1  0.3026330
## run 2  0.3006478
## run 3  0.2965800
## run 4  0.2934025
## run 5  0.2985019
## run 6  0.3038042
## run 7  0.3018757
## run 8  0.2907803
## run 9  0.2968350
## run 10 0.2961763
## run 11 0.2993841
## run 12 0.2929597
## run 13 0.2910719
## run 14 0.2818298
## run 15 0.2994594
## run 16 0.2894918
## run 17 0.2949548
## run 18 0.2914579
## run 19 0.2884607
## run 20 0.3000826
## run 21 0.2908541
## run 22 0.2981863
## run 23 0.2902612
## run 24 0.2904714
## run 25 0.3029123
## run 26 0.3011128
## run 27 0.2925949
## run 28 0.2941099
## run 29 0.3023984
## run 30 0.3020126
## run 31 0.2983558
## run 32 0.2997775
## run 33 0.2902648
## run 34 0.2993205
## run 35 0.2987137
## run 36 0.2952065
## run 37 0.2994274
## run 38 0.3042761
## run 39 0.2976461
## run 40 0.2993694
## run 41 0.2967565
## run 42 0.2944768
## run 43 0.2941358
## run 44 0.2882492
## run 45 0.2983567
## run 46 0.2942568
## run 47 0.2906855
## run 48 0.2999784
## run 49 0.3022837
## run 50 0.2979496

## find the run with the lowest cross-entropy

lowest.ce <- which.min(ce)
lowest.ce

## [1] 14

## extract Q-matrix for the best run

qmatrix <- as.data.frame(Q(snmf1, K = 9, run = lowest.ce))
head(qmatrix)

##            V1          V2         V3          V4          V5       V6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320 0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000 0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973 0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600 0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964 0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400 0.319622
##            V7          V8          V9
## 1 9.99639e-05 0.049494200 0.079542400
## 2 1.66984e-02 0.000099973 0.299490000
## 3 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.425819000 0.000099964

## changing order of levels

pops <- fly_gen3$pop
levels(pops)

##  [1] "Austria"         "MI-WI"           "Charlottesville" "Denmark"        
##  [5] "Esparto"         "Finland"         "France"          "Germany"        
##  [9] "NY-MA"           "PA"              "Russia"          "Spain"          
## [13] "Tuolumne"        "Turkey"          "Ukraine"

levels(pops)[3] <- "Charlott."


qmplot <- cbind(qmatrix, pops)
qmplot$pops <- factor(qmplot$pops, levels = c("Esparto", "Tuolumne", "MI-WI", "Charlott.", "PA", "NY-MA", "Spain", "France", "Germany", "Austria", "Denmark", "Finland", "Ukraine", "Russia", "Turkey"))
qmplot <- qmplot[order(qmplot$pops), ]

pops <- qmplot$pops

## png("admixture_revision3.png", width = 7, height = 7, units = "in", res = 300)

## pdf("admixture_revision3.pdf")

layout(matrix(c(0, 1, 1, 0,
                0, 1, 1, 0,
                2, 2, 2, 2,
                2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))

par(mar = c(4, 4, 1, 0))
plot(snmf1, col = "blue", cex = 1.5, pch = 19, las = 1, cex.lab = 1.3)
Arrows(x = 9, y = 0.295, x1 = 9, y1 = 0.305, col = "red", arr.type = "triangle", code = 1, lwd = 1.5, arr.length = 0.2)
mtext("(a)", side = 2, at = 0.342, line = 2.8, font = 2, family = "serif", las = 1)

par(mar = c(5, 4.5, 1, 0))
barplot(t(qmplot[1:9]), col = RColorBrewer::brewer.pal(9,"Paired"), border = NA, space = 0, xlab = "", xaxt = "n",  ylab = "Admixture proportion", las = 1, cex.lab = 1.4)
ticks <- c(0, 6, 11, 20, 26, 43, 49, 56, 66, 78, 84, 89, 95, 133, 139)

## adding population labels to the axis:

names <- unique(as.character(pops))

#medians <- c()

for(i in ticks){
#for(i in 1:length(pops)){

    axis(1, at = i, labels = "")
    #axis(1, at = median(which(pops == pops[i])), labels = "")
    #medians <- c(medians, median(which(pops == pops[i])))
}

#medians[1:10]
text(x = c(3, 8, 15, 25, 29, 47, 53, 60, 73, 82, 87, 94, 101, 136, 143), y = par("usr")[3] - 0.05, labels = names, xpd = NA, srt = 35, cex = 1, adj = 1)
#text(x = as.numeric(unique(as.character(medians))), y = par("usr")[3] - 0.05, labels = names, xpd = NA, srt = 35, cex = 1, adj = 1.2)
mtext("Pools", side = 1, line = 4)
mtext("(b)", side = 2, at = 1.1, line = 2.4, font = 2, family = "serif", las = 1)

## dev.off()

Figure 2. Population structure analysis based on individual ancestry coefficients for a number of ancestral populations. (a) Cross-entropy values for each snmf run with k ranging between k = 1 and k = 10. The red arrow indicates the most likely value of k. (b) Admixture proportion across populations of D. melanogaster. Tick marks on the x axis correspond to the median number of pools per locality. Colors indicate genetic clusters.

## label column names of qmatrix

ncol(qmatrix)

## [1] 9

cluster_names = c()

for(i in 1:ncol(qmatrix)){
  cluster_names[i] = paste("Cluster", i)
}

cluster_names

## [1] "Cluster 1" "Cluster 2" "Cluster 3" "Cluster 4" "Cluster 5" "Cluster 6"
## [7] "Cluster 7" "Cluster 8" "Cluster 9"

colnames(qmatrix) <- cluster_names
head(qmatrix)

##     Cluster 1   Cluster 2  Cluster 3   Cluster 4   Cluster 5 Cluster 6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320  0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000  0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973  0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600  0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964  0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400  0.319622
##     Cluster 7   Cluster 8   Cluster 9
## 1 9.99639e-05 0.049494200 0.079542400
## 2 1.66984e-02 0.000099973 0.299490000
## 3 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.425819000 0.000099964

dim(qmatrix)

## [1] 151   9

## add individual IDs

qmatrix$Ind <- indNames(fly_gen3)

## add site IDs

qmatrix$Site <- fly_gen3$pop
head(qmatrix)

##     Cluster 1   Cluster 2  Cluster 3   Cluster 4   Cluster 5 Cluster 6
## 1 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05 0.000755320  0.613557
## 2 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01 0.178439000  0.299589
## 3 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02 0.000099973  0.328833
## 4 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05 0.018455600  0.145086
## 5 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05 0.000099964  0.184407
## 6 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05 0.036707400  0.319622
##     Cluster 7   Cluster 8   Cluster 9             Ind    Site
## 1 9.99639e-05 0.049494200 0.079542400   AT_gr_12_fall Austria
## 2 1.66984e-02 0.000099973 0.299490000 AT_gr_12_spring Austria
## 3 9.99730e-05 0.188641000 0.133300000    AT_Mau_14_01 Austria
## 4 9.99730e-05 0.516370000 0.147398000    AT_Mau_14_02 Austria
## 5 9.99640e-05 0.167587000 0.000099964    AT_Mau_15_50 Austria
## 6 9.99640e-05 0.425819000 0.000099964    AT_Mau_15_51 Austria

## calculate mean admixture proportions for each site

clusters <- grep("Cluster", names(qmatrix)) ## indexes of cluster columns
avg_admix <- aggregate(qmatrix[, clusters], list(qmatrix$Site, qmatrix$Ind), mean)
colnames(avg_admix)[1:2] <- c("country", "Ind")
head(avg_admix)

##   country             Ind   Cluster 1   Cluster 2  Cluster 3   Cluster 4
## 1 Austria   AT_gr_12_fall 9.99639e-05 9.99639e-05 0.25625200 9.99639e-05
## 2 Austria AT_gr_12_spring 9.99730e-05 9.99730e-05 0.06442830 1.41056e-01
## 3 Austria    AT_Mau_14_01 9.99730e-05 7.78667e-02 0.24535100 2.57084e-02
## 4 Austria    AT_Mau_14_02 9.99730e-05 1.64043e-01 0.00834852 9.99730e-05
## 5 Austria    AT_Mau_15_50 1.32607e-02 4.37458e-02 0.59060000 9.99640e-05
## 6 Austria    AT_Mau_15_51 9.99640e-05 1.04134e-01 0.11331700 9.99640e-05
##     Cluster 5 Cluster 6   Cluster 7   Cluster 8   Cluster 9
## 1 0.000755320  0.613557 9.99639e-05 0.049494200 0.079542400
## 2 0.178439000  0.299589 1.66984e-02 0.000099973 0.299490000
## 3 0.000099973  0.328833 9.99730e-05 0.188641000 0.133300000
## 4 0.018455600  0.145086 9.99730e-05 0.516370000 0.147398000
## 5 0.000099964  0.184407 9.99640e-05 0.167587000 0.000099964
## 6 0.036707400  0.319622 9.99640e-05 0.425819000 0.000099964

str(avg_admix)

## 'data.frame':    151 obs. of  11 variables:
##  $ country  : Factor w/ 15 levels "Austria","MI-WI",..: 1 1 1 1 1 1 5 5 5 5 ...
##  $ Ind      : chr  "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
##  $ Cluster 1: num  0.0001 0.0001 0.0001 0.0001 0.0133 ...
##  $ Cluster 2: num  0.0001 0.0001 0.0779 0.164 0.0437 ...
##  $ Cluster 3: num  0.25625 0.06443 0.24535 0.00835 0.5906 ...
##  $ Cluster 4: num  0.0001 0.1411 0.0257 0.0001 0.0001 ...
##  $ Cluster 5: num  0.000755 0.178439 0.0001 0.018456 0.0001 ...
##  $ Cluster 6: num  0.614 0.3 0.329 0.145 0.184 ...
##  $ Cluster 7: num  0.0001 0.0167 0.0001 0.0001 0.0001 ...
##  $ Cluster 8: num  0.0495 0.0001 0.1886 0.5164 0.1676 ...
##  $ Cluster 9: num  0.0795 0.2995 0.1333 0.1474 0.0001 ...

## import csv file containing coordinates

coor <- read.csv("coor.csv")
str(coor)

## 'data.frame':    16 obs. of  3 variables:
##  $ country: chr  "Austria" "Esparto" "Tuolumne" "Germany" ...
##  $ long   : num  15.96 -122.05 -120.26 9.71 10.21 ...
##  $ lat    : num  48.3 38.7 38 48.2 55.9 ...

head(coor)

##    country        long      lat
## 1  Austria   15.965000 48.28750
## 2  Esparto -122.046000 38.68000
## 3 Tuolumne -120.260000 37.97000
## 4  Germany    9.714757 48.19901
## 5  Denmark   10.212586 55.94513
## 6    Spain    1.279236 41.51821

admix <- merge(coor, avg_admix, by = "country")
str(admix)

## 'data.frame':    151 obs. of  13 variables:
##  $ country  : chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ long     : num  16 16 16 16 16 ...
##  $ lat      : num  48.3 48.3 48.3 48.3 48.3 ...
##  $ Ind      : chr  "AT_gr_12_fall" "AT_gr_12_spring" "AT_Mau_14_01" "AT_Mau_14_02" ...
##  $ Cluster 1: num  0.0001 0.0001 0.0001 0.0001 0.0133 ...
##  $ Cluster 2: num  0.0001 0.0001 0.0779 0.164 0.0437 ...
##  $ Cluster 3: num  0.25625 0.06443 0.24535 0.00835 0.5906 ...
##  $ Cluster 4: num  0.0001 0.1411 0.0257 0.0001 0.0001 ...
##  $ Cluster 5: num  0.000755 0.178439 0.0001 0.018456 0.0001 ...
##  $ Cluster 6: num  0.614 0.3 0.329 0.145 0.184 ...
##  $ Cluster 7: num  0.0001 0.0167 0.0001 0.0001 0.0001 ...
##  $ Cluster 8: num  0.0495 0.0001 0.1886 0.5164 0.1676 ...
##  $ Cluster 9: num  0.0795 0.2995 0.1333 0.1474 0.0001 ...

head(admix)

##   country   long     lat             Ind   Cluster 1   Cluster 2  Cluster 3
## 1 Austria 15.965 48.2875   AT_gr_12_fall 9.99639e-05 9.99639e-05 0.25625200
## 2 Austria 15.965 48.2875 AT_gr_12_spring 9.99730e-05 9.99730e-05 0.06442830
## 3 Austria 15.965 48.2875    AT_Mau_14_01 9.99730e-05 7.78667e-02 0.24535100
## 4 Austria 15.965 48.2875    AT_Mau_14_02 9.99730e-05 1.64043e-01 0.00834852
## 5 Austria 15.965 48.2875    AT_Mau_15_50 1.32607e-02 4.37458e-02 0.59060000
## 6 Austria 15.965 48.2875    AT_Mau_15_51 9.99640e-05 1.04134e-01 0.11331700
##     Cluster 4   Cluster 5 Cluster 6   Cluster 7   Cluster 8   Cluster 9
## 1 9.99639e-05 0.000755320  0.613557 9.99639e-05 0.049494200 0.079542400
## 2 1.41056e-01 0.178439000  0.299589 1.66984e-02 0.000099973 0.299490000
## 3 2.57084e-02 0.000099973  0.328833 9.99730e-05 0.188641000 0.133300000
## 4 9.99730e-05 0.018455600  0.145086 9.99730e-05 0.516370000 0.147398000
## 5 9.99640e-05 0.000099964  0.184407 9.99640e-05 0.167587000 0.000099964
## 6 9.99640e-05 0.036707400  0.319622 9.99640e-05 0.425819000 0.000099964

colnames(admix)[c(5, 6, 7, 8, 9, 10, 11, 12, 13)] <- c("cluster1", "cluster2", "cluster3", "cluster4", "cluster5", "cluster6", "cluster7", "cluster8", "cluster9")

pies <- admix[ , c(1:3, 5:13)]

meanpies <- aggregate(pies[2:12], by = list(pies$country), mean)

Plotting admixture proportions on a map

## Plotting map

## png("map_revision.png", width = 7, height = 7, units = "in", res = 300)

## pdf("map_revision.pdf")

quartz()

layout(matrix(c(1, 1, 1, 1,
                1, 1, 1, 1,
                2, 2, 2, 2,
                2, 2, 2, 2), nrow = 4, ncol = 4, byrow = TRUE))

map("state", fill = TRUE, col = "beige", border = "gray", asp = 1)
map.axes(cex.axis = 1, las = 1)
mtext(side = 1, line = 3, "Longitude")
mtext(side = 2, line = 3, "Latitude")
addCompass(-120, 29, rot = 0, useWest = TRUE, cex = 0.5, col.compass = "gainsboro")

## Adding pie charts to the map

for(i in 1:nrow(meanpies)){
    add.pie(z = c(meanpies$cluster1[i], meanpies$cluster2[i], meanpies$cluster3[i], meanpies$cluster4[i], meanpies$cluster5[i], meanpies$cluster6[i], meanpies$cluster7[i], meanpies$cluster8[i], meanpies$cluster9[i]), x = meanpies$long[i], y = meanpies$lat[i], radius = 1.5, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = FALSE)
    
}


newmap <- getMap(resolution = "low")
plot(newmap, col = "beige", border = "gray", xlim = c(-10, 40), ylim = c(35, 71), asp = 1)
map.axes(cex.axis = 1, las = 1)
mtext(side = 1, line = 3, "Longitude")
mtext(side = 2, line = 3, "Latitude")
addCompass(-16, 40, rot = 0, useWest = TRUE, cex = 0.5, col.compass = "gainsboro")


## Adding pie charts to the map

for(i in 1:nrow(meanpies)){
    add.pie(z = c(meanpies$cluster1[i], meanpies$cluster2[i], meanpies$cluster3[i], meanpies$cluster4[i], meanpies$cluster5[i], meanpies$cluster6[i], meanpies$cluster7[i], meanpies$cluster8[i], meanpies$cluster9[i]), x = meanpies$long[i], y = meanpies$lat[i], radius = 2.5, col = RColorBrewer::brewer.pal(9,"Paired"), labels = "", border = FALSE)
    
}

## dev.off()

Figure 3. Mean admixture proportions across populations of D. melanogaster surveyed in America (upper panel) and Europe (lower panel). Colors in the pie charts represent genetic clusters.

Isolation by Environment and Isolation by Distance analyses

unique(fly_gen3$pop)

##  [1] Austria         Esparto         Tuolumne        Germany        
##  [5] Denmark         Spain           Finland         France         
##  [9] NY-MA           MI-WI           PA              Russia         
## [13] Turkey          Ukraine         Charlottesville
## 15 Levels: Austria MI-WI Charlottesville Denmark Esparto Finland ... Ukraine

obj <- matrix(NA, nrow = length(unique(fly_gen3$pop)), ncol = 3, byrow = TRUE)

idx <- 1

for(i in unique(fly_gen3$pop)){
    long <- admix$long[admix$country == i][1]
    lat <- admix$lat[admix$country == i][1]
    country <- i
    obj[idx, ] <- c(country, long, lat)
    idx = idx + 1

}

obj

##       [,1]              [,2]          [,3]         
##  [1,] "Austria"         "15.965"      "48.2875"    
##  [2,] "Esparto"         "-122.046"    "38.68"      
##  [3,] "Tuolumne"        "-120.26"     "37.97"      
##  [4,] "Germany"         "9.7147565"   "48.199012"  
##  [5,] "Denmark"         "10.212586"   "55.945128"  
##  [6,] "Spain"           "1.279236"    "41.518208"  
##  [7,] "Finland"         "23.52"       "61.1"       
##  [8,] "France"          "3.5442054"   "46.8656662" 
##  [9,] "NY-MA"           "-74.085"     "42.445"     
## [10,] "MI-WI"           "-88.04915"   "42.61055"   
## [11,] "PA"              "-76.6350005" "40.3366975" 
## [12,] "Russia"          "33.235277"   "58.02444"   
## [13,] "Turkey"          "32.260328"   "40.231444"  
## [14,] "Ukraine"         "30.16599445" "49.47387778"
## [15,] "Charlottesville" "-78.47"      "38"

obj <- as.data.frame(obj)
colnames(obj) <- c("country", "x", "y")
obj$x <- as.numeric(obj$x)
obj$y <- as.numeric(obj$y)
rownames(obj) <- obj[ , 1]
obj <- obj[ , c(2, 3)]
str(obj)

## 'data.frame':    15 obs. of  2 variables:
##  $ x: num  15.96 -122.05 -120.26 9.71 10.21 ...
##  $ y: num  48.3 38.7 38 48.2 55.9 ...

fly_gen3@other <- obj[ , c(1, 2)]
fly_gen3

## /// GENIND OBJECT /////////
## 
##  // 151 individuals; 1,550 loci; 3,100 alleles; size: 2.6 Mb
## 
##  // Basic content
##    @tab:  151 x 3100 matrix of allele counts
##    @loc.n.all: number of alleles per locus (range: 2-2)
##    @loc.fac: locus factor for the 3100 columns of @tab
##    @all.names: list of allele names for each locus
##    @ploidy: ploidy of each individual  (range: 2-2)
##    @type:  codom
##    @call: .local(x = x, i = i, j = j, loc = ..1, drop = drop)
## 
##  // Optional content
##    @pop: population of each individual (group size range: 5-38)
##    @other: a list containing: x  y

## genetic distance

GD.pop.PairwiseFst.hierfstat <- as.dist(hierfstat::pairwise.neifst(hierfstat::genind2hierfstat(fly_gen3)))
GD <- as.matrix(GD.pop.PairwiseFst.hierfstat)

USA <- c("MI-WI", "Charlottesville", "Esparto", "NY-MA", "PA", "Tuolumne")

GD <- as.dist(GD[!rownames(GD) %in% USA, !colnames(GD) %in% USA])
GD

##         Austria Denmark Finland France Germany Russia  Spain Turkey
## Denmark  0.0181                                                    
## Finland  0.0151  0.0246                                            
## France   0.0212  0.0186  0.0222                                    
## Germany  0.0128  0.0170  0.0136 0.0100                             
## Russia   0.0169  0.0234  0.0192 0.0222  0.0210                     
## Spain    0.0165  0.0256  0.0200 0.0158  0.0077 0.0318              
## Turkey   0.0139  0.0217  0.0163 0.0280  0.0188 0.0167 0.0264       
## Ukraine  0.0137  0.0234  0.0214 0.0281  0.0196 0.0067 0.0308 0.0129

## geographic distance

Dgeo <- round(dist(fly_gen3$other), 3)
Dgeo <- as.matrix(Dgeo)
Dgeo <- as.dist(Dgeo[!rownames(Dgeo) %in% USA, !colnames(Dgeo) %in% USA])

## making sure the order of rows and columns are the same in all of the matrices

ord.gen <- as.matrix(GD)
ord.geo <- as.matrix(Dgeo)

ord.geo

##         Austria Germany Denmark  Spain Finland France Russia Turkey Ukraine
## Austria   0.000   6.251   9.578 16.171  14.874 12.502 19.826 18.178  14.250
## Germany   6.251   0.000   7.762 10.761  18.895  6.313 25.490 23.912  20.491
## Denmark   9.578   7.762   0.000 16.969  14.271 11.265 23.116 27.074  20.977
## Spain    16.171  10.761  16.969  0.000  29.633  5.807 35.967 31.008  29.962
## Finland  14.874  18.895  14.271 29.633   0.000 24.529 10.190 22.625  13.392
## France   12.502   6.313  11.265  5.807  24.529  0.000 31.719 29.473  26.749
## Russia   19.826  25.490  23.116 35.967  10.190 31.719  0.000 17.820   9.085
## Turkey   18.178  23.912  27.074 31.008  22.625 29.473 17.820  0.000   9.477
## Ukraine  14.250  20.491  20.977 29.962  13.392 26.749  9.085  9.477   0.000

ord.gen <- ord.gen[rownames(ord.geo), colnames(ord.geo)]
ord.gen

##         Austria Germany Denmark  Spain Finland France Russia Turkey Ukraine
## Austria  0.0000  0.0128  0.0181 0.0165  0.0151 0.0212 0.0169 0.0139  0.0137
## Germany  0.0128  0.0000  0.0170 0.0077  0.0136 0.0100 0.0210 0.0188  0.0196
## Denmark  0.0181  0.0170  0.0000 0.0256  0.0246 0.0186 0.0234 0.0217  0.0234
## Spain    0.0165  0.0077  0.0256 0.0000  0.0200 0.0158 0.0318 0.0264  0.0308
## Finland  0.0151  0.0136  0.0246 0.0200  0.0000 0.0222 0.0192 0.0163  0.0214
## France   0.0212  0.0100  0.0186 0.0158  0.0222 0.0000 0.0222 0.0280  0.0281
## Russia   0.0169  0.0210  0.0234 0.0318  0.0192 0.0222 0.0000 0.0167  0.0067
## Turkey   0.0139  0.0188  0.0217 0.0264  0.0163 0.0280 0.0167 0.0000  0.0129
## Ukraine  0.0137  0.0196  0.0234 0.0308  0.0214 0.0281 0.0067 0.0129  0.0000

fst.dist <- round(as.dist(ord.gen), 3)
fst.dist

##         Austria Germany Denmark Spain Finland France Russia Turkey
## Germany   0.013                                                   
## Denmark   0.018   0.017                                           
## Spain     0.016   0.008   0.026                                   
## Finland   0.015   0.014   0.025 0.020                             
## France    0.021   0.010   0.019 0.016   0.022                     
## Russia    0.017   0.021   0.023 0.032   0.019  0.022              
## Turkey    0.014   0.019   0.022 0.026   0.016  0.028  0.017       
## Ukraine   0.014   0.020   0.023 0.031   0.021  0.028  0.007  0.013

Dgeo

##         Austria Germany Denmark  Spain Finland France Russia Turkey
## Germany   6.251                                                    
## Denmark   9.578   7.762                                            
## Spain    16.171  10.761  16.969                                    
## Finland  14.874  18.895  14.271 29.633                             
## France   12.502   6.313  11.265  5.807  24.529                     
## Russia   19.826  25.490  23.116 35.967  10.190 31.719              
## Turkey   18.178  23.912  27.074 31.008  22.625 29.473 17.820       
## Ukraine  14.250  20.491  20.977 29.962  13.392 26.749  9.085  9.477

obj

##                           x        y
## Austria           15.965000 48.28750
## Esparto         -122.046000 38.68000
## Tuolumne        -120.260000 37.97000
## Germany            9.714757 48.19901
## Denmark           10.212586 55.94513
## Spain              1.279236 41.51821
## Finland           23.520000 61.10000
## France             3.544205 46.86567
## NY-MA            -74.085000 42.44500
## MI-WI            -88.049150 42.61055
## PA               -76.635001 40.33670
## Russia            33.235277 58.02444
## Turkey            32.260328 40.23144
## Ukraine           30.165994 49.47388
## Charlottesville  -78.470000 38.00000

dprec <- obj[!rownames(obj) %in% USA, ]
dprec

##                 x        y
## Austria 15.965000 48.28750
## Germany  9.714757 48.19901
## Denmark 10.212586 55.94513
## Spain    1.279236 41.51821
## Finland 23.520000 61.10000
## France   3.544205 46.86567
## Russia  33.235277 58.02444
## Turkey  32.260328 40.23144
## Ukraine 30.165994 49.47388

files <- list.files("/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio", full.names = TRUE)
files

##  [1] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_10.tif"
##  [2] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_11.tif"
##  [3] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_14.tif"
##  [4] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_15.tif"
##  [5] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_16.tif"
##  [6] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_18.tif"
##  [7] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_19.tif"
##  [8] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_2.tif" 
##  [9] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_3.tif" 
## [10] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_4.tif" 
## [11] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_5.tif" 
## [12] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_6.tif" 
## [13] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_8.tif" 
## [14] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_9.tif"

pre_sea <- stack(files[4])

coord <- dprec[ , c(1, 2)]
head(coord)

##                 x        y
## Austria 15.965000 48.28750
## Germany  9.714757 48.19901
## Denmark 10.212586 55.94513
## Spain    1.279236 41.51821
## Finland 23.520000 61.10000
## France   3.544205 46.86567

prec.season <- extract(pre_sea, coord)

colnames(prec.season) <- c("precseason")
rownames(prec.season) <- rownames(coord)
head(prec.season)

##         precseason
## Austria   37.24262
## Germany   30.28178
## Denmark   21.50687
## Spain     25.15103
## Finland   33.33337
## France    14.22891

prec <- round(dist(prec.season), 3)
prec

##         Austria Germany Denmark  Spain Finland France Russia Turkey
## Germany   6.961                                                    
## Denmark  15.736   8.775                                            
## Spain    12.092   5.131   3.644                                    
## Finland   3.909   3.052  11.826  8.182                             
## France   23.014  16.053   7.278 10.922  19.104                     
## Russia    7.163   0.202   8.572  4.928   3.254 15.850              
## Turkey    4.781  11.741  20.516 16.872   8.690 27.794 11.944       
## Ukraine   1.902   5.059  13.833 10.189   2.007 21.111  5.261  6.683

## R script for 'MMRR' function that performs Multiple Matrix Regression with Randomization analysis.

source("MMRR.R")

distances <- list(Dgeo = as.matrix(Dgeo), dprec = as.matrix(prec))

MMRR(as.matrix(fst.dist), distances)

## $r.squared
## r.squared 
## 0.5393563 
## 
## $coefficients
##    Intercept         Dgeo        dprec 
## 0.0089133784 0.0004727619 0.0001568212 
## 
## $tstatistic
## Intercept(t)      Dgeo(t)     dprec(t) 
##     4.886716     5.430338     1.433680 
## 
## $tpvalue
## Intercept(p)      Dgeo(p)     dprec(p) 
##        0.999        0.001        0.213 
## 
## $Fstatistic
## F-statistic 
##    19.31944 
## 
## $Fpvalue
## F p-value 
##     0.001

## partial mantel test

part.mant <- mantel.partial(fst.dist, prec, Dgeo, method = "pearson", permutations = 1000)
part.mant

## 
## Partial Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## mantel.partial(xdis = fst.dist, ydis = prec, zdis = Dgeo, method = "pearson",      permutations = 1000) 
## 
## Mantel statistic r: 0.2421 
##       Significance: 0.086913 
## 
## Upper quantiles of permutations (null model):
##   90%   95% 97.5%   99% 
## 0.223 0.294 0.340 0.402 
## Permutation: free
## Number of permutations: 1000

part.mant2 <- mantel.partial(fst.dist, Dgeo, prec, method = "pearson", permutations = 1000)
part.mant2

## 
## Partial Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## mantel.partial(xdis = fst.dist, ydis = Dgeo, zdis = prec, method = "pearson",      permutations = 1000) 
## 
## Mantel statistic r: 0.687 
##       Significance: 0.000999 
## 
## Upper quantiles of permutations (null model):
##   90%   95% 97.5%   99% 
## 0.229 0.302 0.365 0.421 
## Permutation: free
## Number of permutations: 1000

## mantel test

ibd <- mantel.randtest(fst.dist, Dgeo)
ibd

## Monte-Carlo test
## Call: mantel.randtest(m1 = fst.dist, m2 = Dgeo)
## 
## Observation: 0.714608 
## 
## Based on 999 replicates
## Simulated p-value: 0.001 
## Alternative hypothesis: greater 
## 
##      Std.Obs  Expectation     Variance 
##  4.036339334 -0.001721128  0.031495598

quartz()
plot(ibd, las = 1, main = "") ## isolation by distance is clearly significant

mantCor <- mgram(fst.dist, Dgeo, nperm = 1000)
mantCor

## $mgram
##            lag ngroup     mantelr  pval        llim       ulim
## [1,]  2.513333      0  0.00000000 1.000  0.00000000  0.0000000
## [2,]  7.540000      7  0.48955228 0.006  0.38378455  0.6432192
## [3,] 12.566667      8  0.13723874 0.427 -0.07570513  0.2660222
## [4,] 17.593333      6  0.14671721 0.402  0.01044418  0.3113308
## [5,] 22.620000      6 -0.09568514 0.596 -0.22013509  0.0264787
## [6,] 27.646667      6 -0.44015164 0.005 -0.63975763 -0.2105894
## 
## $resids
## [1] NA
## 
## attr(,"class")
## [1] "mgram"

mantCor2 <- mgram(fst.dist, prec, nperm = 1000)
mantCor2

## $mgram
##            lag ngroup     mantelr  pval        llim          ulim
## [1,]  2.299333      7  0.05706438 0.749 -0.08101704  1.885557e-01
## [2,]  6.898000     13  0.30933846 0.058  0.10874731  5.018355e-01
## [3,] 11.496667      6 -0.10844316 0.540 -0.30876377  1.197642e-01
## [4,] 16.095333      5 -0.03780823 0.848 -0.21908220  1.374838e-01
## [5,] 20.694000      3 -0.24514145 0.156 -0.40650049 -1.204746e-02
## [6,] 25.292667      1 -0.05063182 0.801 -0.11499191  5.459326e-17
## 
## $resids
## [1] NA
## 
## attr(,"class")
## [1] "mgram"

plot(mantCor, las = 1)

plot(mantCor2, las = 1)

## filled dots indicate significant correlations. The Mantel correlogram shows that genotypes are relatively similar at short distance, while this similarity decreases with distance. The negative correlations indicate genetic differenciation between some populations


dens <- MASS::kde2d(as.vector(Dgeo), as.vector(fst.dist), n = 300)
myPal <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)

#lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)

dens2 <- MASS::kde2d(as.vector(prec), as.vector(fst.dist), n = 300)
myPal2 <- colorRampPalette(c("white", "blue", "gold", "orange", "red"))
plot(as.vector(prec), as.vector(fst.dist), pch = 20, xlab = "Precipitation distance", ylab = "Genetic Distance", las = 1)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(prec)), lwd = 2)

#lines(loess.smooth(as.vector(prec), as.vector(fst.dist)), col = "red", lwd = 2)


quartz()

layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4), nrow = 4, ncol = 4, byrow = TRUE))

plot(lm(as.vector(fst.dist) ~ as.vector(prec)), las = 1)

quartz()

layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4), nrow = 4, ncol = 4, byrow = TRUE))

plot(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), las = 1)

## png("distance.revision2.png", width = 7, height = 7, units = "in", res = 300)

## pdf("distance.revision2.pdf")

layout(matrix(c(0, 1, 1, 0,
                0, 1, 1, 0,
                0, 2, 2, 0,
                0, 2, 2, 0), nrow = 4, ncol = 4, byrow = TRUE))

plot(as.vector(Dgeo), as.vector(fst.dist), pch = 20, xlab = "Geographic Distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens, col = transp(myPal(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(Dgeo)), lwd = 2)
#lines(loess.smooth(as.vector(Dgeo), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(a)", side = 2, at = 0.035, line = 2.6, font = 2, family = "serif", las = 1)

plot(as.vector(prec), as.vector(fst.dist), pch = 20, xlab = "Precipitation distance", ylab = "Genetic Distance", las = 1, cex.lab = 1.4)
image(dens2, col = transp(myPal2(300), 0.7), add = TRUE)
abline(lm(as.vector(fst.dist) ~ as.vector(prec)), lwd = 2)
#lines(loess.smooth(as.vector(prec), as.vector(fst.dist)), col = "red", lwd = 2)
mtext("(b)", side = 2, at = 0.035, line = 3.1, font = 2, family = "serif", las = 1)

## dev.off()

Figure 6. Patterns of isolation by distance and isolation by environment according to Multiple Matrix Regression with Randomization analysis (MMRR). (a-b) Geographic and precipitation distances as a function of genetic distance in European populations of D. melanogaster. Colors represent estimated probability densities.

Analysis of seasonlity and adaptation test

## extracting environmental variables from WorldClim and other sources


mat_adap <- admix[ , c(1, 2, 3, 4)]
head(mat_adap)

##   country   long     lat             Ind
## 1 Austria 15.965 48.2875   AT_gr_12_fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring
## 3 Austria 15.965 48.2875    AT_Mau_14_01
## 4 Austria 15.965 48.2875    AT_Mau_14_02
## 5 Austria 15.965 48.2875    AT_Mau_15_50
## 6 Austria 15.965 48.2875    AT_Mau_15_51

mat_adap$season <- rep(NA, nrow(mat_adap))

idx <- 1
for(i in mat_adap$Ind){
    obj <- unique(afis$season[afis$sampleId == i])
    mat_adap[idx, 5] <- obj
    idx = idx + 1
}

mat_adap$season[mat_adap$season == "frost"] <- "fall"
head(mat_adap)

##   country   long     lat             Ind season
## 1 Austria 15.965 48.2875   AT_gr_12_fall   fall
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring
## 3 Austria 15.965 48.2875    AT_Mau_14_01 spring
## 4 Austria 15.965 48.2875    AT_Mau_14_02   fall
## 5 Austria 15.965 48.2875    AT_Mau_15_50 spring
## 6 Austria 15.965 48.2875    AT_Mau_15_51   fall

## extracting NPP


NPP_file <- list.files("/Users/dpadil10/Dropbox (ASU)/npp-geotiff", full.names = TRUE)
NPP_file

## [1] "/Users/dpadil10/Dropbox (ASU)/npp-geotiff/npp_geotiff.tif"

NPP.data <- stack(NPP_file)

prod <- mat_adap[ , c(2, 3)] ## loading the dataframe with the coordinates
head(prod)

##     long     lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875

NPP.data.ext <- extract(NPP.data, prod)
head(NPP.data.ext)

##       npp_geotiff
## [1,] 264522317824
## [2,] 264522317824
## [3,] 264522317824
## [4,] 264522317824
## [5,] 264522317824
## [6,] 264522317824

NPP.data.ext <- as.data.frame(NPP.data.ext)
colnames(NPP.data.ext) <- "NPP"
head(NPP.data.ext)

##            NPP
## 1 264522317824
## 2 264522317824
## 3 264522317824
## 4 264522317824
## 5 264522317824
## 6 264522317824

##is.na(NPP.data.ext)


## extracting BIO15 precipitation seasonality (coefficient of variation), BIO4 temperature seasonality (standard deviation * 100)

files <- list.files("/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio", full.names = TRUE)
files

##  [1] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_10.tif"
##  [2] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_11.tif"
##  [3] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_14.tif"
##  [4] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_15.tif"
##  [5] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_16.tif"
##  [6] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_18.tif"
##  [7] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_19.tif"
##  [8] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_2.tif" 
##  [9] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_3.tif" 
## [10] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_4.tif" 
## [11] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_5.tif" 
## [12] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_6.tif" 
## [13] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_8.tif" 
## [14] "/Users/dpadil10/Dropbox (ASU)/Padilla et al/wc2.1_30s_bio/wc2.1_30s_bio_9.tif"

pre_sea <- stack(files[4])
tem_sea <- stack(files[10])

coord <- mat_adap[ , c(2, 3)]
head(coord)

##     long     lat
## 1 15.965 48.2875
## 2 15.965 48.2875
## 3 15.965 48.2875
## 4 15.965 48.2875
## 5 15.965 48.2875
## 6 15.965 48.2875

prec.season <- extract(pre_sea, coord)
temp.season <- extract(tem_sea, coord)

variables <- cbind(prec.season, temp.season, log(NPP.data.ext))
head(variables)

##   wc2.1_30s_bio_15 wc2.1_30s_bio_4      NPP
## 1         37.24262        744.4373 26.30119
## 2         37.24262        744.4373 26.30119
## 3         37.24262        744.4373 26.30119
## 4         37.24262        744.4373 26.30119
## 5         37.24262        744.4373 26.30119
## 6         37.24262        744.4373 26.30119

colnames(variables) <- c("precseason", "tempseason", "logNPP")
head(variables)

##   precseason tempseason   logNPP
## 1   37.24262   744.4373 26.30119
## 2   37.24262   744.4373 26.30119
## 3   37.24262   744.4373 26.30119
## 4   37.24262   744.4373 26.30119
## 5   37.24262   744.4373 26.30119
## 6   37.24262   744.4373 26.30119

env <- cbind(mat_adap, variables)
head(env)

##   country   long     lat             Ind season precseason tempseason   logNPP
## 1 Austria 15.965 48.2875   AT_gr_12_fall   fall   37.24262   744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring   37.24262   744.4373 26.30119
## 3 Austria 15.965 48.2875    AT_Mau_14_01 spring   37.24262   744.4373 26.30119
## 4 Austria 15.965 48.2875    AT_Mau_14_02   fall   37.24262   744.4373 26.30119
## 5 Austria 15.965 48.2875    AT_Mau_15_50 spring   37.24262   744.4373 26.30119
## 6 Austria 15.965 48.2875    AT_Mau_15_51   fall   37.24262   744.4373 26.30119

## PCA of environmental variables

x <- tab(fly_gen3, NA.method = "mean")

snppca <- as.data.frame(x)
snppca$Ind <- rownames(x)
snppca$country <- fly_gen3$pop
dim(snppca)

## [1]  151 3102

gen <- snppca[order(snppca$country), ]

merging <- merge(gen, env, by = "Ind")
merging[1:5, 1:5]

##               Ind 162333.A 162333.C 162335.C 162335.T
## 1   AT_gr_12_fall        1        1        1        1
## 2 AT_gr_12_spring        1        1        1        1
## 3    AT_Mau_14_01        0        2        1        1
## 4    AT_Mau_14_02        1        1        1        1
## 5    AT_Mau_15_50        1        1        1        1

rownames(merging) <- merging[ , 1]
dim(merging)

## [1]  151 3109

merging[1:5, 3101:3109]

##                 163997.T country.x country.y   long     lat season precseason
## AT_gr_12_fall          0   Austria   Austria 15.965 48.2875   fall   37.24262
## AT_gr_12_spring        0   Austria   Austria 15.965 48.2875 spring   37.24262
## AT_Mau_14_01           0   Austria   Austria 15.965 48.2875 spring   37.24262
## AT_Mau_14_02           1   Austria   Austria 15.965 48.2875   fall   37.24262
## AT_Mau_15_50           0   Austria   Austria 15.965 48.2875 spring   37.24262
##                 tempseason   logNPP
## AT_gr_12_fall     744.4373 26.30119
## AT_gr_12_spring   744.4373 26.30119
## AT_Mau_14_01      744.4373 26.30119
## AT_Mau_14_02      744.4373 26.30119
## AT_Mau_15_50      744.4373 26.30119

merging <- merging[ , -c(1, 3102:3109)]
dim(merging)

## [1]  151 3100

merging[1:5, 3090:3100]

##                 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall          0        1        1        1        1        2        0
## AT_gr_12_spring        0        1        1        1        1        2        0
## AT_Mau_14_01           1        1        1        1        1        2        0
## AT_Mau_14_02           0        1        1        1        1        2        0
## AT_Mau_15_50           1        1        1        1        1        2        0
##                 163996.G 163996.C 163997.A 163997.T
## AT_gr_12_fall          1        1        2        0
## AT_gr_12_spring        1        1        2        0
## AT_Mau_14_01           1        1        2        0
## AT_Mau_14_02           1        1        1        1
## AT_Mau_15_50           1        1        2        0

genomic <- merging

#env$Ind <- as.factor(env$Ind)
#env$Ind <- as.character(levels(env$Ind))
dim(snppca)

## [1]  151 3102

snppca[1:5, 1:5]

##                 162333.A 162333.C 162335.C 162335.T 162336.T
## AT_gr_12_fall          1        1        1        1        2
## AT_gr_12_spring        1        1        1        1        2
## AT_Mau_14_01           0        2        1        1        1
## AT_Mau_14_02           1        1        1        1        2
## AT_Mau_15_50           1        1        1        1        2

snppca[1:5, 3090:3102]

##                 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall          0        1        1        1        1        2        0
## AT_gr_12_spring        0        1        1        1        1        2        0
## AT_Mau_14_01           1        1        1        1        1        2        0
## AT_Mau_14_02           0        1        1        1        1        2        0
## AT_Mau_15_50           1        1        1        1        1        2        0
##                 163996.G 163996.C 163997.A 163997.T             Ind country
## AT_gr_12_fall          1        1        2        0   AT_gr_12_fall Austria
## AT_gr_12_spring        1        1        2        0 AT_gr_12_spring Austria
## AT_Mau_14_01           1        1        2        0    AT_Mau_14_01 Austria
## AT_Mau_14_02           1        1        1        1    AT_Mau_14_02 Austria
## AT_Mau_15_50           1        1        2        0    AT_Mau_15_50 Austria

snppca <- snppca[ , -c(3101, 3102)]
dim(snppca)

## [1]  151 3100

snppca[1:5, 3090:3100]

##                 163992.G 163993.T 163993.C 163994.G 163994.A 163995.G 163995.A
## AT_gr_12_fall          0        1        1        1        1        2        0
## AT_gr_12_spring        0        1        1        1        1        2        0
## AT_Mau_14_01           1        1        1        1        1        2        0
## AT_Mau_14_02           0        1        1        1        1        2        0
## AT_Mau_15_50           1        1        1        1        1        2        0
##                 163996.G 163996.C 163997.A 163997.T
## AT_gr_12_fall          1        1        2        0
## AT_gr_12_spring        1        1        2        0
## AT_Mau_14_01           1        1        2        0
## AT_Mau_14_02           1        1        1        1
## AT_Mau_15_50           1        1        2        0

genomic <- genomic[env[ , 4], ]

identical(rownames(genomic), env[ , 4]) ## genotypes and environmental data are in the same order

## [1] TRUE

## How many seasons per country are there?

table(data.frame(season = env$season, country = env$country))

##         country
## season   Austria Charlottesville Denmark Esparto Finland France Germany MI-WI
##   fall         3               3       4       3       2      4       6     5
##   spring       3               3       1       2       4      6       6     4
##         country
## season   NY-MA PA Russia Spain Tuolumne Turkey Ukraine
##   fall       3  9      3     4        4      4      14
##   spring     3  8      3     3        1      9      24

xtable(table(data.frame(season = env$season, country = env$country)), caption = "caption here")

## % latex table generated in R 4.2.2 by xtable 1.8-4 package
## % Wed Nov 15 23:10:37 2023
## \begin{table}[ht]
## \centering
## \begin{tabular}{rrrrrrrrrrrrrrrr}
##   \hline
##  & Austria & Charlottesville & Denmark & Esparto & Finland & France & Germany & MI-WI & NY-MA & PA & Russia & Spain & Tuolumne & Turkey & Ukraine \\ 
##   \hline
## fall &   3 &   3 &   4 &   3 &   2 &   4 &   6 &   5 &   3 &   9 &   3 &   4 &   4 &   4 &  14 \\ 
##   spring &   3 &   3 &   1 &   2 &   4 &   6 &   6 &   4 &   3 &   8 &   3 &   3 &   1 &   9 &  24 \\ 
##    \hline
## \end{tabular}
## \caption{caption here} 
## \end{table}

names(env)

## [1] "country"    "long"       "lat"        "Ind"        "season"    
## [6] "precseason" "tempseason" "logNPP"

## CCA

gen.cca <- cca(genomic ~ season*country, env)
gen.cca

## Call: cca(formula = genomic ~ season * country, data = env)
## 
##               Inertia Proportion Rank
## Total         0.21786    1.00000     
## Constrained   0.07607    0.34917   29
## Unconstrained 0.14179    0.65083  121
## Inertia is scaled Chi-square 
## 
## Eigenvalues for constrained axes:
##     CCA1     CCA2     CCA3     CCA4     CCA5     CCA6     CCA7     CCA8 
## 0.027197 0.006577 0.003781 0.003062 0.002942 0.002595 0.002390 0.002168 
##     CCA9    CCA10    CCA11    CCA12    CCA13    CCA14    CCA15    CCA16 
## 0.002039 0.001935 0.001810 0.001709 0.001580 0.001386 0.001334 0.001266 
##    CCA17    CCA18    CCA19    CCA20    CCA21    CCA22    CCA23    CCA24 
## 0.001217 0.001095 0.001071 0.001057 0.001016 0.000947 0.000919 0.000913 
##    CCA25    CCA26    CCA27    CCA28    CCA29 
## 0.000874 0.000861 0.000821 0.000788 0.000720 
## 
## Eigenvalues for unconstrained axes:
##      CA1      CA2      CA3      CA4      CA5      CA6      CA7      CA8 
## 0.006362 0.003602 0.003278 0.003023 0.002933 0.002655 0.002486 0.002442 
## (Showing 8 of 121 unconstrained eigenvalues)

anova(gen.cca)

## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = genomic ~ season * country, data = env)
##           Df ChiSquare      F Pr(>F)    
## Model     29  0.076071 2.2385  0.001 ***
## Residual 121  0.141792                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

env$country <- as.factor(env$country)
levels(env$country)

##  [1] "Austria"         "Charlottesville" "Denmark"         "Esparto"        
##  [5] "Finland"         "France"          "Germany"         "MI-WI"          
##  [9] "NY-MA"           "PA"              "Russia"          "Spain"          
## [13] "Tuolumne"        "Turkey"          "Ukraine"

cols <- env$country
cols <- as.numeric(cols)

ncol <- nPop(fly_gen3)
palette <- distinctColorPalette(ncol)

colors <- rep()

for(i in cols){
    col <- palette[i]
    colors <- c(colors, col)
}

colors

##   [1] "#BD4BDB" "#BD4BDB" "#BD4BDB" "#BD4BDB" "#BD4BDB" "#BD4BDB" "#E08851"
##   [8] "#E08851" "#E08851" "#E08851" "#E08851" "#E08851" "#E2AAA2" "#E2AAA2"
##  [15] "#E2AAA2" "#E2AAA2" "#E2AAA2" "#D49EDA" "#D49EDA" "#D49EDA" "#D49EDA"
##  [22] "#D49EDA" "#DE609B" "#DE609B" "#DE609B" "#DE609B" "#DE609B" "#DE609B"
##  [29] "#7776D8" "#7776D8" "#7776D8" "#7776D8" "#7776D8" "#7776D8" "#7776D8"
##  [36] "#7776D8" "#7776D8" "#7776D8" "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0"
##  [43] "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0" "#85E2A0"
##  [50] "#85E2A0" "#D8CC6A" "#D8CC6A" "#D8CC6A" "#D8CC6A" "#D8CC6A" "#D8CC6A"
##  [57] "#D8CC6A" "#D8CC6A" "#D8CC6A" "#757C7A" "#757C7A" "#757C7A" "#757C7A"
##  [64] "#757C7A" "#757C7A" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5"
##  [71] "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5"
##  [78] "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#CFDEB5" "#7DB0DE" "#7DB0DE"
##  [85] "#7DB0DE" "#7DB0DE" "#7DB0DE" "#7DB0DE" "#72E95E" "#72E95E" "#72E95E"
##  [92] "#72E95E" "#72E95E" "#72E95E" "#72E95E" "#75DED6" "#75DED6" "#75DED6"
##  [99] "#75DED6" "#75DED6" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF"
## [106] "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF" "#CCD6DF"
## [113] "#CCD6DF" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [120] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [127] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [134] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [141] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"
## [148] "#C8E34C" "#C8E34C" "#C8E34C" "#C8E34C"

## png("cca_revision3.png", width = 7, height = 7, units = "in", res = 300)

## pdf("cca_revision3.pdf")

plot(gen.cca, display = "sites", type = "n", las = 1)
with(gen.cca, points(gen.cca, display = "sites", pch = 21, bg = colors, col = colors))
##with(env[3], ordiellipse(gen.cca, env$country, kind = "se", conf = 0.95, col = "gray"))
##with(env[3], ordihull(gen.cca, env$country, col = colors, lty = 2))
with(env[3], ordispider(gen.cca, env$country, col = unique(colors), label = TRUE))

##legend("topright", legend = levels(env$country), pch = 19, col = unique(cols), bty = "n", cex = 0.8)

## dev.off()

Figure 4. Constrained correspondence analysis depicting the difference in allele frequency across populations mediated by the interaction between season and locality. Filled dots represent pooled samples and “spider” diagrams connect the pools to the population centroid.

## Detecting selection

load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.pv1.prec.chr2.RData")
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.pv1.temp.chr2.RData")
load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/fly.pv1.npp.chr2.RData")


fly.pv1.prec.chr2 <- fly.pv1.prec
fly.pv1.temp.chr2 <- fly.pv1.temp
fly.pv1.npp.chr2 <- fly.pv1.npp

rm(fly.pv1.prec)
rm(fly.pv1.temp)
rm(fly.pv1.npp)

head(env)

##   country   long     lat             Ind season precseason tempseason   logNPP
## 1 Austria 15.965 48.2875   AT_gr_12_fall   fall   37.24262   744.4373 26.30119
## 2 Austria 15.965 48.2875 AT_gr_12_spring spring   37.24262   744.4373 26.30119
## 3 Austria 15.965 48.2875    AT_Mau_14_01 spring   37.24262   744.4373 26.30119
## 4 Austria 15.965 48.2875    AT_Mau_14_02   fall   37.24262   744.4373 26.30119
## 5 Austria 15.965 48.2875    AT_Mau_15_50 spring   37.24262   744.4373 26.30119
## 6 Austria 15.965 48.2875    AT_Mau_15_51   fall   37.24262   744.4373 26.30119

##env.pca <- rda(env[ , c(2, 3, 6, 7, 8)], scale = T)
env.pca <- rda(env[ , c(6, 7, 8)], scale = TRUE)
summary(env.pca)$cont

## $importance
## Importance of components:
##                          PC1    PC2    PC3
## Eigenvalue            1.2227 0.9646 0.8127
## Proportion Explained  0.4076 0.3215 0.2709
## Cumulative Proportion 0.4076 0.7291 1.0000

par(las = 1)
screeplot(env.pca, bstick = TRUE, type = "lines", las = 1, main = "")

round(scores(env.pca, choices = 1:3, display = "species", scaling = 0), digits = 3)

##               PC1    PC2    PC3
## precseason -0.655  0.273 -0.704
## tempseason  0.658 -0.251 -0.710
## logNPP      0.371  0.929  0.016
## attr(,"const")
## [1] 4.605779

#pred.PC2 <- scores(env.pca, choices = 2, display = "sites", scaling = 0)

##pred.PC1 <- scores(env.pca, choices = 1:2, display = "sites", scaling = 0) ## reviewer 3 suggested to use PC1 instead

K <- 9

fly.lfmm1.prec <- lfmm_ridge(Y = snppca, X = env$precseason, K = K) ## change K as you see fit
fly.lfmm1.temp <- lfmm_ridge(Y = snppca, X = env$tempseason, K = K) ## change K as you see fit
fly.lfmm1.npp <- lfmm_ridge(Y = snppca, X = env$logNPP, K = K) ## change K as you see fit

##fly.lfmm1 <- lfmm_ridge(Y = snppca, X = pred.PC1, K = K) ## using PC1 as suggested by reviewer 3

fly.pv1.prec <- lfmm_test(Y = snppca, X = env$precseason, lfmm = fly.lfmm1.prec, calibrate = "gif")
fly.pv1.temp <- lfmm_test(Y = snppca, X = env$tempseason, lfmm = fly.lfmm1.temp, calibrate = "gif")
fly.pv1.npp <- lfmm_test(Y = snppca, X = env$logNPP, lfmm = fly.lfmm1.npp, calibrate = "gif")

##fly.pv1 <- lfmm_test(Y = snppca, X = pred.PC1, lfmm = fly.lfmm1, calibrate = "gif") ## using PC1 as suggested by reviewer 3

names(fly.pv1.prec) # this object includes raw z-scores and p-values, as well as GIF-calibrated scores and p-values

## [1] "B"                 "epsilon.sigma2"    "B.sigma2"         
## [4] "score"             "pvalue"            "gif"              
## [7] "calibrated.score2" "calibrated.pvalue"

names(fly.pv1.prec.chr2)

## [1] "B"                 "epsilon.sigma2"    "B.sigma2"         
## [4] "score"             "pvalue"            "gif"              
## [7] "calibrated.score2" "calibrated.pvalue"

## let’s look at the genomic inflation factor (GIF)

fly.pv1.prec$gif

## [1] 1.019627

fly.pv1.temp$gif

## [1] 2.071159

fly.pv1.npp$gif

## [1] 1.066583

fly.pv1.prec.chr2$gif

## [1] 1.294608

fly.pv1.temp.chr2$gif

## [1] 1.79341

fly.pv1.npp.chr2$gif

## [1] 1.191985

## an appropriately calibrated set of tests will have a GIF of around 1

## let’s look at how application of the GIF to the p-values impacts the p-value distribution


quartz()
layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4), nrow = 4, ncol = 4, byrow = FALSE))

par(mar = c(5, 5, 1, 1), las = 1)

hist(fly.pv1.prec$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "for", ylim = c(0, 350))
legend("top", "K = 9", bty = "n")
hist(fly.pv1.prec$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 400))
legend("top", "K = 9", bty = "n")

hist(fly.pv1.prec.chr2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "Chr 2L", ylim = c(0, 120000))
legend("top", "K = 8", bty = "n")
hist(fly.pv1.prec.chr2$pvalue[ , 1], xlab = "Unadjusted p-values", las = 1, main = "", ylim = c(0, 140000))
legend("top", "K = 8", bty = "n")

## convert the adjusted p-values to q-values. q-values provide a measure of each SNP’s significance, automatically taking into account the fact that thousands are simultaneously being tested. I can then use an FDR threshold to control the number of false positive detections (given that the p-value distribution is “well-behaved”)

fly.qv1.prec <- qvalue(fly.pv1.prec$calibrated.pvalue)$qvalues
fly.qv1.temp <- qvalue(fly.pv1.temp$calibrated.pvalue)$qvalues
fly.qv1.npp <- qvalue(fly.pv1.npp$calibrated.pvalue)$qvalues

length(which(fly.qv1.prec < 0.001)) ## how many SNPs have an FDR < 0.1%?

## [1] 6

length(which(fly.qv1.temp < 0.001)) ## how many SNPs have an FDR < 0.1%?

## [1] 0

length(which(fly.qv1.npp < 0.001)) ## how many SNPs have an FDR < 0.1%?

## [1] 0

fly.qv1.prec.chr2 <- qvalue(fly.pv1.prec.chr2$calibrated.pvalue)$qvalues
fly.qv1.temp.chr2 <- qvalue(fly.pv1.temp.chr2$calibrated.pvalue)$qvalues
fly.qv1.npp.chr2 <- qvalue(fly.pv1.npp.chr2$calibrated.pvalue)$qvalues

length(which(fly.qv1.prec.chr2 < 0.001)) ## how many SNPs have an FDR < 0.1%?

## [1] 104

length(which(fly.qv1.temp.chr2 < 0.001)) ## how many SNPs have an FDR < 0.1%?

## [1] 4

length(which(fly.qv1.npp.chr2 < 0.001)) ## how many SNPs have an FDR < 0.1%?

## [1] 32

## using K = 9, the default GIF correction, and an FDR threshold of 0.001, I detected 2 candidate SNPs under selection in response to the PC2 environmental predictor

(fly.FDR.prec <- colnames(snppca)[which(fly.qv1.prec < 0.001)]) ## identify which SNPs these are

## [1] "162369.T" "162369.C" "163496.G" "163496.A" "163504.G" "163504.A"

(fly.FDR.temp <- colnames(snppca)[which(fly.qv1.temp < 0.001)]) ## identify which SNPs these are

## character(0)

(fly.FDR.npp <- colnames(snppca)[which(fly.qv1.npp < 0.001)]) ## identify which SNPs these are

## character(0)

dim(snppca)

## [1]  151 3100

rm(snppca)

load("/Users/dpadil10/Dropbox (ASU)/foraging_mode_Drosophila/data_analysis/chr2-scan/snppca_chr2.RData")

dim(snppca)

## [1]     151 1862322

(fly.FDR.prec.chr2 <- colnames(snppca)[which(fly.qv1.prec.chr2 < 0.001)]) ## identify which SNPs these are

##   [1] "4341.A"   "4341.T"   "20722.C"  "20722.T"  "24540.T"  "24540.A" 
##   [7] "42028.C"  "42028.T"  "49396.C"  "49396.G"  "71603.T"  "71603.C" 
##  [13] "79614.A"  "79614.G"  "143754.T" "143754.C" "144065.T" "144065.C"
##  [19] "163496.G" "163496.A" "281854.G" "281854.C" "339746.G" "339746.A"
##  [25] "358648.C" "358648.T" "359700.C" "359700.G" "359838.G" "359838.C"
##  [31] "359851.G" "359851.A" "365020.C" "365020.T" "428037.C" "428037.T"
##  [37] "435493.C" "435493.A" "436264.C" "436264.T" "460821.T" "460821.A"
##  [43] "461800.G" "461800.A" "479192.C" "479192.G" "542516.T" "542516.C"
##  [49] "549898.C" "549898.G" "576471.T" "576471.C" "592567.A" "592567.G"
##  [55] "638134.C" "638134.A" "658660.G" "658660.A" "659375.A" "659375.G"
##  [61] "693165.G" "693165.A" "693375.T" "693375.G" "707347.A" "707347.G"
##  [67] "713129.A" "713129.G" "716215.G" "716215.A" "721348.A" "721348.T"
##  [73] "726550.A" "726550.G" "737646.G" "737646.T" "766843.G" "766843.C"
##  [79] "776548.G" "776548.A" "817451.T" "817451.A" "817454.A" "817454.G"
##  [85] "835825.C" "835825.T" "844869.A" "844869.C" "845091.C" "845091.G"
##  [91] "918074.G" "918074.A" "927644.T" "927644.C" "943283.G" "943283.T"
##  [97] "952506.C" "952506.T" "967638.G" "967638.A" "973978.G" "973978.A"
## [103] "975445.G" "975445.A"

(fly.FDR.temp.chr2 <- colnames(snppca)[which(fly.qv1.temp.chr2 < 0.001)]) ## identify which SNPs these are

## [1] "555268.G" "555268.T" "571263.A" "571263.G"

(fly.FDR.temp.chr2 <- colnames(snppca)[which(fly.qv1.npp.chr2 < 0.001)]) ## identify which SNPs these are

##  [1] "3205.G"   "3205.A"   "95522.T"  "95522.C"  "110380.A" "110380.T"
##  [7] "178736.T" "178736.C" "224757.T" "224757.G" "249489.G" "249489.A"
## [13] "295197.T" "295197.C" "337949.C" "337949.T" "347923.A" "347923.G"
## [19] "425629.G" "425629.A" "568526.C" "568526.G" "590620.G" "590620.A"
## [25] "744599.C" "744599.T" "745747.C" "745747.T" "747288.T" "747288.C"
## [31] "858669.T" "858669.C"

## visualizing results


## Manhattan plot with causal loci
 
qvalues <- fly.qv1.prec
qvalues.chr2 <- fly.qv1.prec.chr2

row <- as.data.frame(qvalues)
str(row)

## 'data.frame':    3100 obs. of  1 variable:
##  $ V1: num  0.604 0.604 0.999 0.999 0.999 ...

rownames(row) <- 1:nrow(row)

row.chr2 <- as.data.frame(qvalues.chr2)
str(row.chr2)

## 'data.frame':    1862322 obs. of  1 variable:
##  $ V1: num  1 1 1 1 1 ...

rownames(row.chr2) <- 1:nrow(row.chr2)


(causal.set <- as.numeric(rownames(row)[which(row < 0.001)]))

## [1]   67   68 2161 2162 2175 2176

(fly.FDR.prec <- gsub("[^0-9]", "", fly.FDR.prec))

## [1] "162369" "162369" "163496" "163496" "163504" "163504"

head(afis[afis$variant.id == 162369, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2)

##    variant.id                   col    gene chr     pos genotype   type
## 1:     162369 upstream_gene_variant CG15418  2L 3623241      T,C pooled
## 2:     162369 upstream_gene_variant CG15418  2L 3623241      T,C pooled
##      stationId
## 1: AUM00011389
## 2: AUM00011389

head(afis[afis$variant.id == 163496, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2)

##    variant.id            col gene chr     pos genotype   type   stationId
## 1:     163496 intron_variant  for  2L 3642321      G,A pooled AUM00011389
## 2:     163496 intron_variant  for  2L 3642321      G,A pooled AUM00011389

head(afis[afis$variant.id == 163504, c(2, 6, 7, 8, 9, 11, 21, 32)], n = 2)

##    variant.id            col gene chr     pos genotype   type   stationId
## 1:     163504 intron_variant  for  2L 3642581      G,A pooled AUM00011389
## 2:     163504 intron_variant  for  2L 3642581      G,A pooled AUM00011389

(causal.set.chr2 <- as.numeric(rownames(row.chr2)[which(row.chr2 < 0.001)]))

##   [1]    8335    8336   39089   39090   45925   45926   77569   77570   91269
##  [10]   91270  132685  132686  147341  147342  266101  266102  266683  266684
##  [19]  302205  302206  519505  519506  625735  625736  660431  660432  662379
##  [28]  662380  662635  662636  662661  662662  672287  672288  789701  789702
##  [37]  803555  803556  804963  804964  850903  850904  852743  852744  885291
##  [46]  885292 1003143 1003144 1016873 1016874 1066663 1066664 1097045 1097046
##  [55] 1181605 1181606 1219657 1219658 1221031 1221032 1284071 1284072 1284465
##  [64] 1284466 1310821 1310822 1321549 1321550 1327407 1327408 1337029 1337030
##  [73] 1346873 1346874 1367921 1367922 1422971 1422972 1441219 1441220 1517919
##  [82] 1517920 1517925 1517926 1552999 1553000 1570079 1570080 1570499 1570500
##  [91] 1709595 1709596 1727421 1727422 1756843 1756844 1774303 1774304 1803125
## [100] 1803126 1815307 1815308 1818119 1818120

(fly.FDR.prec.chr2 <- gsub("[^0-9]", "", fly.FDR.prec.chr2))

##   [1] "4341"   "4341"   "20722"  "20722"  "24540"  "24540"  "42028"  "42028" 
##   [9] "49396"  "49396"  "71603"  "71603"  "79614"  "79614"  "143754" "143754"
##  [17] "144065" "144065" "163496" "163496" "281854" "281854" "339746" "339746"
##  [25] "358648" "358648" "359700" "359700" "359838" "359838" "359851" "359851"
##  [33] "365020" "365020" "428037" "428037" "435493" "435493" "436264" "436264"
##  [41] "460821" "460821" "461800" "461800" "479192" "479192" "542516" "542516"
##  [49] "549898" "549898" "576471" "576471" "592567" "592567" "638134" "638134"
##  [57] "658660" "658660" "659375" "659375" "693165" "693165" "693375" "693375"
##  [65] "707347" "707347" "713129" "713129" "716215" "716215" "721348" "721348"
##  [73] "726550" "726550" "737646" "737646" "766843" "766843" "776548" "776548"
##  [81] "817451" "817451" "817454" "817454" "835825" "835825" "844869" "844869"
##  [89] "845091" "845091" "918074" "918074" "927644" "927644" "943283" "943283"
##  [97] "952506" "952506" "967638" "967638" "973978" "973978" "975445" "975445"

data.frame(causal.set.chr2, fly.FDR.prec.chr2)

##     causal.set.chr2 fly.FDR.prec.chr2
## 1              8335              4341
## 2              8336              4341
## 3             39089             20722
## 4             39090             20722
## 5             45925             24540
## 6             45926             24540
## 7             77569             42028
## 8             77570             42028
## 9             91269             49396
## 10            91270             49396
## 11           132685             71603
## 12           132686             71603
## 13           147341             79614
## 14           147342             79614
## 15           266101            143754
## 16           266102            143754
## 17           266683            144065
## 18           266684            144065
## 19           302205            163496
## 20           302206            163496
## 21           519505            281854
## 22           519506            281854
## 23           625735            339746
## 24           625736            339746
## 25           660431            358648
## 26           660432            358648
## 27           662379            359700
## 28           662380            359700
## 29           662635            359838
## 30           662636            359838
## 31           662661            359851
## 32           662662            359851
## 33           672287            365020
## 34           672288            365020
## 35           789701            428037
## 36           789702            428037
## 37           803555            435493
## 38           803556            435493
## 39           804963            436264
## 40           804964            436264
## 41           850903            460821
## 42           850904            460821
## 43           852743            461800
## 44           852744            461800
## 45           885291            479192
## 46           885292            479192
## 47          1003143            542516
## 48          1003144            542516
## 49          1016873            549898
## 50          1016874            549898
## 51          1066663            576471
## 52          1066664            576471
## 53          1097045            592567
## 54          1097046            592567
## 55          1181605            638134
## 56          1181606            638134
## 57          1219657            658660
## 58          1219658            658660
## 59          1221031            659375
## 60          1221032            659375
## 61          1284071            693165
## 62          1284072            693165
## 63          1284465            693375
## 64          1284466            693375
## 65          1310821            707347
## 66          1310822            707347
## 67          1321549            713129
## 68          1321550            713129
## 69          1327407            716215
## 70          1327408            716215
## 71          1337029            721348
## 72          1337030            721348
## 73          1346873            726550
## 74          1346874            726550
## 75          1367921            737646
## 76          1367922            737646
## 77          1422971            766843
## 78          1422972            766843
## 79          1441219            776548
## 80          1441220            776548
## 81          1517919            817451
## 82          1517920            817451
## 83          1517925            817454
## 84          1517926            817454
## 85          1552999            835825
## 86          1553000            835825
## 87          1570079            844869
## 88          1570080            844869
## 89          1570499            845091
## 90          1570500            845091
## 91          1709595            918074
## 92          1709596            918074
## 93          1727421            927644
## 94          1727422            927644
## 95          1756843            943283
## 96          1756844            943283
## 97          1774303            952506
## 98          1774304            952506
## 99          1803125            967638
## 100         1803126            967638
## 101         1815307            973978
## 102         1815308            973978
## 103         1818119            975445
## 104         1818120            975445

quartz()

layout(matrix(c(0, 1, 1, 0,
                0, 1, 1, 0,
                0, 2, 2, 0,
                0, 2, 2, 0), nrow = 4, ncol = 4, byrow = TRUE))


plot(-log10(qvalues), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, main = "for")
points(causal.set[c(3, 5)], -log10(qvalues)[causal.set[c(3, 5)]], pch = 19, col = "red")
text(causal.set[c(3, 5)], (-log10(qvalues)[causal.set[c(3, 5)]] - 0.4), fly.FDR.prec[c(3, 5)], col = "red")


plot(-log10(qvalues.chr2), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, main = "Chr 2L")
points(causal.set.chr2[c(19, 20)], -log10(qvalues.chr2)[causal.set.chr2[c(19, 20)]], pch = 19, col = "red")
text(causal.set.chr2[c(19, 20)], (-log10(qvalues.chr2)[causal.set.chr2[c(19, 20)]] - 0.5), fly.FDR.prec.chr2[c(19, 20)], col = "red", cex = 0.8)

quartz()

## png("adaptation_revision2.png", width = 7, height = 7, units = "in", res = 300)

## pdf("adaptation_revision2.pdf")


layout(matrix(c(1, 1, 2, 2,
                1, 1, 2, 2,
                3, 3, 4, 4,
                3, 3, 4, 4), nrow = 4, ncol = 4, byrow = FALSE))

hist(fly.pv1.prec$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "for", ylim = c(0, 350))
legend("top", paste("K = 9\nGIF = ", round(fly.pv1.prec$gif, 3)), bty = "n")
mtext("(a)", side = 2, at = 365, line = 2.5, las = 1, font = 2, family = "serif")

plot(-log10(qvalues), pch = 19, cex = 1, bg = "gray", col = alpha("gray", 0.6), xlab = "SNP", las = 1)
points(causal.set[c(3, 5)], -log10(qvalues)[causal.set[c(3, 5)]], pch = 19, col = "red")
text(causal.set[c(3, 5)], (-log10(qvalues)[causal.set[c(3, 5)]] - 0.4), fly.FDR.prec[c(3, 5)], col = "red")

par(mar = c(6, 7, 4.1, 1.5), mgp = c(4, 1.5, 0))
hist(fly.pv1.prec.chr2$calibrated.pvalue[ , 1], xlab = "GIF-adjusted p-values", las = 1, main = "Chr 2L", ylim = c(0, 120000))
legend("top", paste("K = 8\nGIF = ", round(fly.pv1.prec.chr2$gif, 3)), bty = "n")
mtext("(b)", side = 2, at = 122000, line = 4.5, las = 1, font = 2, family = "serif")

par(mar = c(5.1, 4.1, 4.1, 1.5), mgp = c(3, 1, 0))
plot(-log10(qvalues.chr2), pch = 19, cex = 1, bg = "black", col = alpha("gray", 0.3), xlab = "SNP", las = 1, main = "")
points(causal.set.chr2[-c(19, 20)], -log10(qvalues.chr2)[causal.set.chr2[-c(19, 20)]], pch = 19, col = "black")
points(causal.set.chr2[c(19, 20)], -log10(qvalues.chr2)[causal.set.chr2[c(19, 20)]], pch = 19, col = "red")
text(causal.set.chr2[c(19, 20)], (-log10(qvalues.chr2)[causal.set.chr2[c(19, 20)]] - 0.5), fly.FDR.prec.chr2[c(19, 20)], col = "red", cex = 0.8)

## dev.off()

Figure 5. Genotype-environment association test based on a latent factor mixed model with ridge penalty. (a) Distribution of adjusted p-values using the default genomic inflation factor, and the for loci (SNPs) potentially affected by precipitation. Likewise, (b) displays the same analysis for the entire chromosome 2L. Bold dots represent loci considered to be under selection according to a False Discovery Rate of 0.001.