SI_spatial_avg.Rmd

---
title: "Supporting Information: Species relationships in the extremes and their influence on community stability"
author: "Shyamolina Ghosh, Kathryn L. Cottingham, Daniel C. Reuman"
date: ""
fontsize: 11pt
geometry: "left=1.5cm,right=1.5cm,top=1cm,bottom=1.8cm"

output: 
  pdf_document:
    number_sections: yes
    keep_tex: yes
    fig_caption: yes

header-includes:
      - \usepackage{xr} \externaldocument[MT-]{MT_spatial_avg}
      - \input{head_supp.sty}

mainfont: Times New Roman 
tables: True
link-citations: True
urlcolor : blue
indent : True
csl: ecology-letters.csl
bibliography: REF_CSS.bib
---

\pagenumbering{roman}
\tableofcontents
\listoftables
\listoffigures

\pagenumbering{arabic}

<!--Basic setup-->
```{r setup, echo=F, message=FALSE}
# Document-wide knitr defaults: echo code unless a chunk overrides it,
# and pin figures where they are produced.
knitr::opts_chunk$set(fig.pos = "H", echo = TRUE)

# Seed shared by every chunk that calls set.seed(seed).
seed <- 101

# Packages attached for the whole document (attach order kept unchanged).
library(dplyr)
library(kableExtra)
library(stringr)
library(rmarkdown)
#library(xfun)
# families<-c(1,3:10,13,14,16:20)

# Supplies mtime(), which the cache.extra chunk options of later chunks call.
source("mtime.R")
```

<!-- read the raw Hays data and create the results folders for Hays -->
```{r read hays_rawdata,echo=F,results="hide"}
# Read the raw Hays input tables and make sure the Hays output folders exist.
set.seed(seed)

# Basal cover records.
cover <- read.csv("./Data/HaysData/allrecords.csv")

# Quadrat sampling inventory (was a quadrat sampled or not).
quadsamp <- read.csv("./Data/HaysData/quadrat_inventory.csv")

# Quadrat metadata, including the grazing flag.
quad_info <- read.csv("./Data/HaysData/quadrat_info.csv")
count_quad_grazed <- table(quad_info$Grazing) # 36 No grazing, 15 Yes grazed
# Names of the grazed quadrats; these 15 plots are excluded later when the
# averaged basal cover is computed for each species.
quad_grazing <- as.character(quad_info$quadrat[which(quad_info$Grazing == "Yes")])

# Species list; entries whose type is "remove" are treated as non-plant records.
spinfo <- read.csv("./Data/HaysData/species_list.csv")
nonplant <- as.character(spinfo$species[which(spinfo$type == "remove")])
# These are not plant species as per the metadata file (page 8 in the pdf):
# "Bare ground" "Fragment" "Mixed grass" "Polygonum spp." "Unknown"
# (the original author was unsure about "Mixed grass" and "Polygonum spp.").
spinfo[which(spinfo$type == "remove"), ] # for details

#-------creating results folders for hays data------------
# recursive = TRUE also creates "./Results" when it does not exist yet; the
# non-recursive call would only warn and leave the later saveRDS() calls failing.
for (out_dir in c("./Results/hays_results",
                  "./Results/hays_results/skewness_results")) {
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
}
```

<!-- preparing hays spatial average data in usual format for common+pseudo sp.-->
```{r prepare_hays_spaceavg,results="hide",echo=F,cache=T, cache.extra=list(seed,cover,spinfo,quad_grazing,nonplant)}
# Split the "plotyear" code into plot and year, list the unique years, plots
# and species, and compare the species record counts with the species list.
set.seed(seed)
library(stringr)

py <- cover$plotyear
# Everything except the last two characters is the plot identifier ...
plots <- str_sub(string = py, start = 1, end = str_length(py) - 2)
# ... and the last two characters are the two-digit year.
yrs <- str_sub(string = py, start = -2)
cover <- cbind(plot = plots, yr = as.numeric(yrs), cover)

uyr <- sort(unique(cover$yr))                        # 41 unique years
uplot <- as.character(sort(unique(cover$plot)))      # 51 unique quadrats
splist <- as.character(sort(unique(cover$species)))

# Check with raw data: record counts per species should agree with spinfo.
# NOTE(review): the comparison is positional, so it assumes spinfo lists the
# species in the same order as table(cover$species) - confirm.
spcount <- as.data.frame(table(cover$species))
checkcount <- (spinfo$count == spcount$Freq)
# all(checkcount = T) passed a *named* argument and so always returned TRUE;
# this evaluates the comparison itself. These should all be TRUE.
all(checkcount)

# Build a year x plot x species array holding the summed basal cover of each
# species in each quadrat and year, run two sanity checks, and save the array.
# (Optional logging, disabled:
#  sink("./Results/hays_results/hays_myquadsamplinginfo.txt", append=TRUE, split=TRUE))

hays_array <- array(Inf,
                    dim = c(length(uyr), length(uplot), length(splist)),
                    dimnames = list(uyr, uplot, splist))

for (iyr in seq_along(uyr)) {
  tempo <- subset(cover, cover$yr == uyr[iyr])
  for (iplot in seq_along(uplot)) {
    tempo2 <- subset(tempo, tempo$plot == uplot[iplot])
    if (nrow(tempo2) == 0) {
      cat("iyr = ",iyr, " Year : ",uyr[iyr], "iplot = ",iplot," plot : ",uplot[iplot],"Not surveyed : --NA--","\n")
      # NA means this plot was not surveyed in that year, so every species
      # gets NA as basal cover.
      hays_array[iyr, iplot, ] <- NA
    } else {
      # Sum the area over all records of a species in this quadrat-year.
      cover_by_sp <- tapply(tempo2$area, as.character(tempo2$species), sum)
      # A surveyed plot has cover 0 for every species without a record.
      hays_array[iyr, iplot, ] <- 0
      # Write the sums by species *name*. Writing by position in split()
      # output is only correct when `species` is a factor carrying every level;
      # with character columns (read.csv default since R 4.0) the values would
      # land in the wrong species slots and absent species would stay Inf.
      hays_array[iyr, iplot, match(names(cover_by_sp), splist)] <- as.vector(cover_by_sp)
    }
  }
}

# Check 1: total basal cover reported for a 1 sq. meter quadrat in any year
# should be ~10000 cm^2.
check_matcover <- apply(hays_array, FUN = sum, MARGIN = c(1, 2), na.rm = TRUE)
check_matcover[check_matcover == 0] <- NA
# check_matcover now holds NA for plots not surveyed, otherwise a value ~10000.
range(check_matcover, na.rm = TRUE)
#hist(check_matcover,ylim=c(0,5),col="grey",breaks=100) # check with plot

#sink()

# Check 2: no surveyed plot-year should have zero cover for all species.
s <- apply(hays_array, FUN = sum, MARGIN = c(1, 2)) # NA or a positive number
any(s == 0, na.rm = TRUE)  # there should not be any zeros

saveRDS(hays_array,"./Results/hays_results/hays_array_41yr_51plot_151sp.RDS")

# Average the basal cover of every species over all ungrazed plots, per year,
# and save the resulting year x species table.
avg_cover_over_plots_hays <- matrix(Inf, nrow = length(uyr), ncol = length(splist))
rownames(avg_cover_over_plots_hays) <- c(1932:1972)
colnames(avg_cover_over_plots_hays) <- splist
avg_cover_over_plots_hays <- as.data.frame(avg_cover_over_plots_hays)

# check2: number of plots surveyed per year, raw inventory vs prepared array.

# from raw data
# NOTE(review): the "- 1" presumably discounts a non-plot (year) column of
# quadsamp - confirm against the inventory file.
plotsurveyed <- rowSums(!is.na(quadsamp)) - 1

# from prepared data
isp <- 1 # This should be same for any species
numplot_surveyed_by_year <- apply(hays_array[, , isp], MARGIN = 1,
                                  function(x) {
                                    sum(is.finite(x))
                                  })

all(plotsurveyed == numplot_surveyed_by_year) # They should be the same.

for (isp in seq_along(splist)) {
  m <- hays_array[, , isp]

  # Exclude the grazed plots with a logical mask. Negative indexing
  # (m[, -id_grz]) would drop *every* column if no plot name matched,
  # because -integer(0) selects nothing.
  is_grazed <- colnames(m) %in% quad_grazing
  m <- m[, !is_grazed, drop = FALSE]

  # Rows that are NA for all remaining plots; this should be empty, i.e. at
  # least one ungrazed plot was surveyed in every year.
  idNA <- which(apply(m, 1, function(x) all(is.na(x))))
  if (length(idNA) == 0) {
    cs <- apply(m, FUN = mean, MARGIN = 1, na.rm = TRUE)
    avg_cover_over_plots_hays[, isp] <- cs
  } else {
    cat("Caution : This means for a given year not any plot was surveyed!!!")
  }
}
anyNA(avg_cover_over_plots_hays) # This should be FALSE

#---------saving a big matrix 41yrs by 151 sp. which has each sp (including non-plant) timeseries along each column : hays data------
saveRDS(avg_cover_over_plots_hays,"./Results/hays_results/timeseries_matrix_spatialavg_allsp_hays.RDS")

# Classify every Hays species as common (C), intermediate (I) or rare (R) from
# the number of years with zero averaged cover; non-plant entries stay NA.
count0 <- apply(avg_cover_over_plots_hays, 2, function(x) {
  sum(x == 0)
}) # number of zero-cover years per species
count0_allsp <- as.data.frame(count0)

id_nonplant <- which(colnames(avg_cover_over_plots_hays) %in% nonplant)

# Common: at most 6 zero years, i.e. present for at least 35 years.
id_common_sp_hays <- which(count0_allsp$count0 <= 6)
id_common_sp_hays <- setdiff(id_common_sp_hays, id_nonplant)

# Rare: at least 39 zero years, i.e. present for at most 2 years.
id_rare_sp_hays <- which(count0_allsp$count0 >= 39)
id_rare_sp_hays <- setdiff(id_rare_sp_hays, id_nonplant)

# Intermediate: everything in between. Both conditions now use the count0
# column; the second one previously compared the whole data frame, which only
# worked because that data frame has a single column.
id_interm_sp_hays <- which(count0_allsp$count0 < 39 & count0_allsp$count0 > 6)
id_interm_sp_hays <- setdiff(id_interm_sp_hays, id_nonplant)

# check: the four groups together should account for all 151 species
tot_sp_hays <- length(id_common_sp_hays) + length(id_interm_sp_hays) +
  length(id_rare_sp_hays) + length(id_nonplant)
tot_sp_hays == 151

sp_category_hays <- as.data.frame(matrix(NA, nrow = length(splist), ncol = 3))
colnames(sp_category_hays) <- c("id", "sp", "category")
sp_category_hays$id <- seq_along(splist)
sp_category_hays$sp <- splist
sp_category_hays$category[id_common_sp_hays] <- "C" # common sp. for Hays data
sp_category_hays$category[id_interm_sp_hays] <- "I" # intermediate sp. for Hays data
sp_category_hays$category[id_rare_sp_hays] <- "R"   # rare sp. for Hays data

#---------saving a 151sp table indicating C/I/R category for each hays-sp (for non-plant = NA)---------
saveRDS(sp_category_hays,"./Results/hays_results/all_sp_category_spatialavg_hays.RDS")

# Put the Hays spatial-average series into the list format used by the later
# analyses: one data frame (Year, Dat) per common species, plus one "pseudo"
# species that merges all intermediate and rare species.

# lapply() over the index vector replaces a 1:length() loop, which would have
# iterated over c(1, 0) if no species qualified as common.
sp.screened.data <- lapply(id_common_sp_hays, function(idx) {
  data.frame(Year = c(1932:1972), Dat = avg_cover_over_plots_hays[, idx])
})
names(sp.screened.data) <- splist[id_common_sp_hays]

hays_spaceavg <- list(avg.basal.cover = sp.screened.data)

# Pseudo species = yearly sum over all species in category I or R.
pseudo_hays_IR <- apply(X = avg_cover_over_plots_hays[, which(sp_category_hays$category %in% c("I", "R"))],
                        MARGIN = 1, FUN = sum)
pseudo_hays <- list(data.frame(Year = c(1932:1972), Dat = pseudo_hays_IR))

# Append it after the common species and name that last element.
hays_spaceavg$avg.basal.cover <- c(hays_spaceavg$avg.basal.cover, pseudo_hays)
names(hays_spaceavg$avg.basal.cover)[length(id_common_sp_hays) + 1] <- "pseudo_hays"

#---------saving the spatial avg. data for hays with which we will do taildep. analysis later----------------
saveRDS(hays_spaceavg,"./Results/hays_results/hays_spaceavg_data_CP.RDS")

#-----------saving a matrix with timeseries of common + 1 pseudo (all other merged into 1) sp. for hays------
# The object name pseudo_hays_IR is what cbind() uses as the column name.
ts_mat_CP_hays <- cbind(avg_cover_over_plots_hays[, id_common_sp_hays], pseudo_hays_IR)
saveRDS(ts_mat_CP_hays,"./Results/hays_results/skewness_results/ts_mat_CP_hays.RDS")

# ---- Hays: total basal cover through time for nested species groups ----
# Three curves are drawn: common species only, common + intermediate, and
# common + intermediate + rare.
pdf("./Results/hays_results/skewness_results/total_timeseries.pdf", height = 6, width = 6)
op <- par(mar = c(5.1, 5.1, 1.1, 2.1))

# Yearly totals for each nested group of species.
total_ts_C <- apply(X = avg_cover_over_plots_hays[, id_common_sp_hays],
                    MARGIN = 1, FUN = sum)
total_ts_CI <- apply(X = avg_cover_over_plots_hays[, sort(c(id_common_sp_hays, id_interm_sp_hays))],
                     MARGIN = 1, FUN = sum)
total_ts_CIR <- apply(X = avg_cover_over_plots_hays[, sort(c(id_common_sp_hays, id_interm_sp_hays, id_rare_sp_hays))],
                      MARGIN = 1, FUN = sum)
total_ts <- cbind(total_ts_C, total_ts_CI, total_ts_CIR)

# The y axis runs from the smallest total up to a fixed 10000.
plot(c(1932:1972), total_ts[, 1],
     type = "b", pch = 16, col = rgb(1, 0, 0, 0.5),
     xlim = c(1932, 1972), ylim = c(min(total_ts), 10000),
     xlab = "Years", ylab = expression("Total basal cover, cm"^2),
     cex.lab = 2, cex.axis = 2)
lines(c(1932:1972), total_ts[, 2], type = "b", pch = 16, col = rgb(0, 1, 0, 0.5))
lines(c(1932:1972), total_ts[, 3], type = "b", pch = 1, col = "black")
legend("topright",
       c("common sp.","common + intermediate sp.","all sp. including rare"),
       lty = c(1, 1, 1), pch = c(16, 16, 1),
       col = c(rgb(1, 0, 0, 0.5), rgb(0, 1, 0, 0.5), "black"),
       bty = "n", cex = 1.2)

par(op)
dev.off()


# ---- Hays: one time-series panel per screened species (pseudo species included) ----
pdf("./Results/hays_results/hays_spaceavg_screenedsp_avgcover.pdf", height = 21, width = 21)
op <- par(mfrow = c(6, 5), mar = c(5, 3, 3, 3))

for (i in seq_along(hays_spaceavg$avg.basal.cover)) {
  # The x/y expressions are written out in full on purpose: no ylab is given,
  # so plot() derives the axis label from the y expression itself.
  plot(hays_spaceavg$avg.basal.cover[[i]]$Year, hays_spaceavg$avg.basal.cover[[i]]$Dat,
       type = "b", pch = 19, col = rgb(0, 0, 1, 0.3),
       ylim = c(0, max(hays_spaceavg$avg.basal.cover[[i]]$Dat, na.rm = TRUE)),
       xlab = "Year (1932-1972)")
  abline(h = 0)

  # Panel title: species index, name, and number of zero-cover years.
  n0 <- sum(hays_spaceavg$avg.basal.cover[[i]]$Dat == 0)
  mtext(paste0("sp = ", i, " : ", names(hays_spaceavg$avg.basal.cover)[i], ", n0=", n0))
}

par(op)
dev.off()

# ---- Hays: grid of pairwise copula plots for all selected species ----
# vivj_matrix() comes from the sourced script; with ploton = TRUE it is called
# here once per (i, j) pair to fill an lensp x lensp grid of panels.
source("./vivj_matrix.R")

include_indep <- FALSE
good_sp <- seq_along(hays_spaceavg[[1]])
lensp <- length(good_sp)

# Page size grows with the number of species (2 inches per panel).
pdf("./Results/hays_results/copula_hays_spaceavg.pdf", height = 2 * lensp, width = 2 * lensp)
op <- par(mfrow = c(lensp, lensp), mar = c(3, 3, 3, 3), mgp = c(1.5, 0.5, 0))

for (i in seq_len(lensp)) {
  for (j in seq_len(lensp)) {
    vivj_matrix(d_allsp = hays_spaceavg, loc = 1,
                i = good_sp[i], j = good_sp[j],
                level = 0.05, ploton = TRUE, onbounds = TRUE,
                lb = 0, ub = 0.5,
                include_indep = include_indep)
  }
}

par(op)
dev.off()
```

<!-- non-parametric analysis for cor stat with common + pseudo sp. in hays-->
```{r npa_hays_spaceavg, echo=F, cache=T, cache.extra=list(seed,hays_spaceavg,mtime("NonParamStat.R"),mtime("vivj_matrix.R"),mtime("CopulaFunctions.R"), mtime("CopulaFunctions_flexible.R"))}

# Non-parametric correlation statistics for Hays (common + pseudo species).
# multcall() is provided by the sourced NonParamStat.R.
set.seed(seed)
source("./NonParamStat.R")

# Output folder for this analysis (created on first run).
if (!dir.exists("./Results/hays_results/corstat_hays_spaceavg_results")) {
  dir.create("./Results/hays_results/corstat_hays_spaceavg_results")
}
resloc <- "./Results/hays_results/corstat_hays_spaceavg_results/"

# Analysis settings.
nbin_hays <- 2
include_indep <- FALSE

corstat_hays_spaceavg <- multcall(
  d_allsp = hays_spaceavg,
  loc = 1,
  resloc = resloc,
  good_sp = seq_along(hays_spaceavg[[1]]),
  nbin = nbin_hays,
  include_indep = include_indep
)

# File name carries the number of bins used.
saveRDS(corstat_hays_spaceavg,
        paste0(resloc, "corstat_hays_spaceavg_nbin_", nbin_hays, ".RDS"))
```

<!-- generating Corl - Coru plots from the non-parametric analysis with common + pseudo sp. in hays -->
```{r plot_res_hays_npa_spaceavg, echo=F, results="hide",cache=T,warning=F, cache.extra=list(seed,hays_spaceavg,corstat_hays_spaceavg,nbin_hays,mtime("NonParamStat_matrixplot.R"),mtime("mycorrplot_with_sig.R"),mtime("tailsignif.R"),mtime("CopulaFunctions_flexible.R"))}
# Matrix plots of the non-parametric statistics for Hays.
# NonParamStat_matrixplot() is provided by the sourced script.
set.seed(seed)
source("./NonParamStat_matrixplot.R")

resloc <- "./Results/hays_results/corstat_hays_spaceavg_results/"

# Settings handed to the plotting routine.
include_indep <- FALSE
sigtest <- FALSE
CI <- c(0.025, 0.975)
numsims <- 10000
# Series length, taken from the Year column of the first species (41 years for hays).
numpts <- length(hays_spaceavg$avg.basal.cover[[1]]$Year)
ub <- 1 / nbin_hays

CorlmCoru_hays_spaceavg <- NonParamStat_matrixplot(
  data = corstat_hays_spaceavg, resloc = resloc, tagon = TRUE,
  type = "lower", wd = 13, ht = 13,
  sigtest = sigtest, ub = ub, numpts = numpts,
  numsims = numsims, CI = CI, include_indep = include_indep
)

saveRDS(CorlmCoru_hays_spaceavg,
        paste0(resloc, "CorlmCoru_hays_spaceavg_nbin_", nbin_hays, ".RDS"))
```

```{r binom_sigtest_hays, echo=F}
# only run this chunk if sigtest=TRUE in previous chunks
#source("./binomial_sigtest.R")
# set.seed(seed=101) # not needed as binom test uses two-sided binom.test function 
# also it does not matter here binom_sig="LT" or binom_sig="UT"
#sigtest_hays<-binomial_sigtest(ylist=CorlmCoru_hays_spaceavg,binom_sig="LT") 
```

<!--get appropriate surrogates for hays with common+pseudo sp.-->
```{r make_surrogs_CP_hays, echo=F, results="hide", warning=F, cache=T, cache.extra=list(ts_mat_CP_hays,mtime("PPSurrogObjFun.R"),mtime("pwlin.R"),mtime("getmap.R"),mtime("alignranks.R"),mtime("SurrogsForHays.R"))}

# Run the surrogate-generation script for Hays (common + pseudo species).
source("SurrogsForHays.R")
# Per the original note: after this script runs, the surrogates are in the
# variable surrogs_CP_hays (later chunks read it) and are saved as
# ./Results/hays_results/skewness_results/pp_surrogs_hays_CP/HaysSurrogates.RDS
# NOTE(review): the save location is stated by the original comment only; the
# script itself is not visible here.
```

<!-- generating stability-based results and plots for hays -->
```{r skewness_hays_spaceavg_PP, echo=F, results="hide", warning=F, cache=T, cache.extra=list(seed,ts_mat_CP_hays,surrogs_CP_hays,mtime("make_tab_stability_assessment.R"),mtime("mycvsq.R"),mtime("SkewnessAnd3CentMom.R"))}

# Stability assessment for Hays on a random subset of the surrogates.
# make_tab_stability() is provided by the sourced script.
set.seed(seed)
source("make_tab_stability_assessment.R")

# Draw numsurrog surrogate matrices, without replacement, along the third
# dimension of the Pearson-preserving surrogate array.
numsurrog <- 10000
id_surrogs <- sample(1:dim(surrogs_CP_hays)[3], numsurrog, replace = FALSE)
surrogs_CP_hays_sampled <- surrogs_CP_hays[, , id_surrogs]

stability_CP_hays <- make_tab_stability(m = ts_mat_CP_hays,
                                        surrogs = surrogs_CP_hays_sampled,
                                        surrogs_given = TRUE)
saveRDS(stability_CP_hays, "./Results/hays_results/skewness_results/stability_CP_hays.RDS")

# Summary table, labelled "C+P" (common + pseudo), also written as CSV.
ans <- stability_CP_hays$df_stability
rownames(ans) <- "C+P"
class(ans)

write.csv(ans, "./Results/hays_results/skewness_results/hays_stability_CP.csv")

# ---- Hays stability figure: histograms of CV^2 and skewness of the surrogates ----
# In each panel: filled dot = value from the real data, triangle = surrogate
# median, dotted lines = 2.5% / 97.5% surrogate quantiles, dashed line = value
# under independence (all read from the columns of `ans`).
pdf("./Results/hays_results/skewness_results/hays_pearson_preserving_results_cvsq_skw_plots.pdf",
    height = 2.5, width = 10)

op <- par(mfrow = c(1, 2), mar = c(6, 5, 0.2, 2), mgp = c(3, 1, 0))

# ---- left panel: CV^2 ----
xlm <- range(ans$cvsq_real, ans$cvsq_indep, stability_CP_hays$cvsq_surrogs)

hist(stability_CP_hays$cvsq_surrogs,
     breaks = 100, col = "grey", border = FALSE,
     xlim = xlm, xaxt = "n", main = "", cex.lab = 1.5,
     xlab = expression(paste(CV^2," of surrogates: Hays")))
# Only the two ends of the x range are labelled.
axis(side = 1, at = round(xlm, 3))
# actual CVsq from real data
points(x = ans$cvsq_real, y = 0, col = "black", pch = 20, cex = 1)
# 95% quantiles of the surrogates
abline(v = c(ans$cvsq_ntd_0.025CI, ans$cvsq_ntd_0.975CI), col = "black", lty = "dotted")
# CVsq with no tail dep. (surrogate median)
points(x = ans$cvsq_ntd_median, y = 0, col = "black", pch = 2, cex = 1.5)
# CVsq if independent
abline(v = ans$cvsq_indep, col = "black", lty = "dashed")

# ---- right panel: skewness ----
xlm <- range(ans$skw_real, ans$skw_indep, stability_CP_hays$skw_surrogs)

hist(stability_CP_hays$skw_surrogs,
     breaks = 100, col = "grey", border = FALSE,
     xlim = xlm, xaxt = "n", main = "", cex.lab = 1.5,
     xlab = "Skewness of surrogates: Hays")
# Both ends of the x range plus zero are labelled.
axis(side = 1, at = round(c(xlm[1], 0, xlm[2]), 3))
# actual skewness from real data
points(x = ans$skw_real, y = 0, col = "black", pch = 20, cex = 1)
# 95% quantiles of the surrogates
abline(v = c(ans$skw_ntd_0.025CI, ans$skw_ntd_0.975CI), col = "black", lty = "dotted")
# Skewness with no tail dep. (surrogate median)
points(x = ans$skw_ntd_median, y = 0, col = "black", pch = 2, cex = 1.5)
# Skewness if independent
abline(v = ans$skw_indep, col = "black", lty = "dashed")

# The legend is drawn into a separate PDF, not onto these panels.
par(op)
dev.off()

# ---- Stand-alone legend strip for the Hays stability figure ----
pdf("./Results/hays_results/skewness_results/legend_plot.pdf", height = 0.5, width = 12.5)
op <- par(mar = c(0.1, 0.1, 0.1, 0.1), mgp = c(1, 1, 0))

# Empty canvas carrying only a four-entry, single-row legend.
plot.new()
legend_order <- matrix(1:4, ncol = 4, byrow = FALSE)
legend("topright",
       legend = c("actual community value (com),",expression(paste("median (nta surrogates),")),"95% CI (nta surrogates),","no interaction (ind)")[legend_order],
       lty = c(NA, NA, 3, 2)[legend_order],
       pch = c(20, 2, NA, NA)[legend_order],
       ncol = 4, x.intersp = 0,
       bty = "n", pt.cex = 1.2, cex = 1.2)
# par(op) is intentionally not called here (kept disabled as in the original);
# the device is closed right away.
dev.off()
```

```{r few_checks_on_surrogs_hays, echo=F, results="hide", cache=T, cache.extra=list(seed,ts_mat_CP_hays,surrogs_CP_hays,mtime("PPsurrogs_tests.R"),mtime("get_var_ratio.R"))}
# Checks on the Hays surrogates, run on a random subset of them.
# PPsurrogs_tests() is provided by the sourced script.
set.seed(seed)
source("./PPsurrogs_tests.R")

# Draw numsurrog surrogate matrices, without replacement, along the third
# dimension of the surrogate array.
numsurrog <- 10000
id_surrogs <- sample(1:dim(surrogs_CP_hays)[3], numsurrog, replace = FALSE)
surrogs_CP_hays_sampled <- surrogs_CP_hays[, , id_surrogs]

# Alternative (disabled): run the tests on the full surrogate array,
#   PPsurrogs_tests(m = ts_mat_CP_hays, surrogs = surrogs_CP_hays)
ans <- PPsurrogs_tests(m = ts_mat_CP_hays, surrogs = surrogs_CP_hays_sampled)

saveRDS(ans, "./Results/hays_results/skewness_results/pp_surrogs_hays_CP/PPsurrogs_tests_with_HaysSurrogates.RDS")
```

```{r plotter_PPsurrogs_check_hays, echo=F, results="hide", cache=T, cache.extra=list(seed,ts_mat_CP_hays,mtime("./Results/hays_results/skewness_results/pp_surrogs_hays_CP/PPsurrogs_tests_with_HaysSurrogates.RDS"),mtime("Plotter_PPsurrogs_tests.R"))}

# Plot the saved surrogate-test results for Hays.
# Plotter_PPsurrogs_tests() is provided by the sourced script.
set.seed(seed)
source("./Plotter_PPsurrogs_tests.R")

# Inputs: the common + pseudo species matrix and the test results saved on disk.
resloc <- "./Results/hays_results/skewness_results/pp_surrogs_hays_CP/"
ans <- readRDS("./Results/hays_results/skewness_results/pp_surrogs_hays_CP/PPsurrogs_tests_with_HaysSurrogates.RDS")
m <- ts_mat_CP_hays

Plotter_PPsurrogs_tests(m = m, ans = ans, resloc = resloc,
                        tag_legend = c("(A)", "(C)", "(E)"))
```

<!-- preparing spatial avg data in usual format with common+pseudo sp. for knz data-->
```{r read_and_prepare_knz_data,echo=F,results="hide",cache=T, cache.extra=list(seed,mtime("./Data/KnzData/KNZ_Data_downloaded/knb-lter-knz.69.15/PVC021.csv"),mtime("./Data/KnzData/KNZ_Data_downloaded/KFH011.csv"))}
# Run the Konza (KNZ) data-cleaning script.
# NOTE(review): later chunks use resloc_knz, resloc_knz_skw and knz_soiltype
# without defining them, so they are presumably created by this script - confirm.
set.seed(seed)
source("./data_cleaning_for_KNZ.R")
```

```{r screen_sp_for_knz_data,echo=F,results="hide",cache=T,cache.extra=list(seed,mtime("data_cleaning_for_KNZ.R"))}
# Classify every KNZ species as common (C), intermediate (I) or rare (R) from
# the number of years in which its cover is zero.
set.seed(seed)
# knz_soiltype<-"t" # given variable for soil type in ./data_cleaning_for_KNZ.R

# Per the original note, the data cleaning file saves the ts_all_sp_knz matrix
# as "./Results/knz_results/ts_all_sp_knz_soiltype_t.RDS".
ts_all_sp_knz <- readRDS(paste0(resloc_knz, "ts_all_sp_knz_soiltype_", knz_soiltype, ".RDS"))

#================ screening for common-intermediate-rare species category ================

# Number of zero-cover years per species (one row per species).
# Kept as two steps: as.data.frame() names the column after the object.
nyr_0_eachsp <- apply(ts_all_sp_knz, MARGIN = 2, FUN = function(x) {
  sum(x == 0)
})
nyr_0_eachsp <- as.data.frame(nyr_0_eachsp)

# Common: absent for at most 1 year.
id_common_sp_knz <- which(nyr_0_eachsp$nyr_0_eachsp <= 1)
# Rare: absent for at least 34 years, i.e. present for at most 2 years.
id_rare_sp_knz <- which(nyr_0_eachsp$nyr_0_eachsp >= 34)
# Normal / intermediate: everything in between.
id_interm_sp_knz <- which(nyr_0_eachsp$nyr_0_eachsp > 1 & nyr_0_eachsp$nyr_0_eachsp < 34)

# Time series of each group.
ts_common_knz <- ts_all_sp_knz[, id_common_sp_knz]
ts_rare_knz <- ts_all_sp_knz[, id_rare_sp_knz]
ts_normal_knz <- ts_all_sp_knz[, id_interm_sp_knz]

# Category table: one row per species, filled group by group.
sp_category_knz <- data.frame(sp = rownames(nyr_0_eachsp), category = NA)
sp_category_knz$category[id_common_sp_knz] <- "C"
sp_category_knz$category[id_interm_sp_knz] <- "I"
sp_category_knz$category[id_rare_sp_knz] <- "R"

#---------saving a table indicating C/I/R category for each knz-sp---------
saveRDS(sp_category_knz,
        paste0(resloc_knz, "all_sp_category_spatialavg_knz_soiltype_", knz_soiltype, ".RDS"))

#===========================Formatting data for tail-asymmetry analysis========================

# Put the KNZ series into the list format used by the tail-asymmetry analysis:
# one data frame (Year, Dat) per common species, plus one "pseudo" species
# that merges all intermediate and rare species.

# lapply() over seq_along() replaces a 1:length() loop, which would have
# iterated over c(1, 0) if no species qualified as common.
sp.screened.data <- lapply(seq_along(id_common_sp_knz), function(isp) {
  data.frame(Year = c(1983:2018), Dat = ts_common_knz[, isp])
})
names(sp.screened.data) <- rownames(nyr_0_eachsp)[id_common_sp_knz]

knz_spaceavg <- list(avg.percent.cover = sp.screened.data)

# Pseudo species = yearly sum over all species in category I or R.
pseudo_knz_IR <- apply(X = ts_all_sp_knz[, which(sp_category_knz$category %in% c("I", "R"))],
                       MARGIN = 1, FUN = sum)
pseudo_knz <- list(data.frame(Year = c(1983:2018), Dat = pseudo_knz_IR))

# Append it after the common species and name that last element.
knz_spaceavg$avg.percent.cover <- c(knz_spaceavg$avg.percent.cover, pseudo_knz)
names(knz_spaceavg$avg.percent.cover)[length(id_common_sp_knz) + 1] <- "pseudo_knz"

#---------saving the spatial avg. data for knz with which we will do taildep. analysis later----------------
saveRDS(knz_spaceavg,
        paste0(resloc_knz, "knz_spaceavg_data_CP_soiltype_", knz_soiltype, ".RDS"))

#-----------saving a dataframe with timeseries of common + 1 pseudo (all other merged into 1) sp. for knz------
# The object name pseudo_knz_IR is what cbind() uses as the column name.
ts_CP_knz <- cbind(ts_common_knz, pseudo_knz_IR)
saveRDS(ts_CP_knz,
        paste0(resloc_knz_skw, "ts_CP_knz_soiltype_", knz_soiltype, ".RDS"))

# ---- KNZ: total percent cover through time for nested species groups ----
# Three curves are drawn: common species only, common + intermediate (normal),
# and common + intermediate + rare.
pdf(paste0(resloc_knz_skw, "total_timeseries_soiltype_", knz_soiltype, ".pdf"),
    height = 6, width = 6)
op <- par(mar = c(5.1, 5.1, 1.1, 2.1))

# Yearly totals for each nested group of species.
total_ts_C <- apply(X = ts_all_sp_knz[, id_common_sp_knz],
                    MARGIN = 1, FUN = sum)
total_ts_CI <- apply(X = ts_all_sp_knz[, sort(c(id_common_sp_knz, id_interm_sp_knz))],
                     MARGIN = 1, FUN = sum)
total_ts_CIR <- apply(X = ts_all_sp_knz[, sort(c(id_common_sp_knz, id_interm_sp_knz, id_rare_sp_knz))],
                      MARGIN = 1, FUN = sum)
total_ts <- cbind(total_ts_C, total_ts_CI, total_ts_CIR)

plot(c(1983:2018), total_ts[, 1],
     type = "b", pch = 16, col = rgb(1, 0, 0, 0.5),
     xlim = c(1983, 2018), ylim = range(total_ts),
     xlab = "Years", ylab = "Total percent cover",
     cex.lab = 2, cex.axis = 2)
lines(c(1983:2018), total_ts[, 2], type = "b", pch = 16, col = rgb(0, 1, 0, 0.5))
lines(c(1983:2018), total_ts[, 3], type = "b", pch = 1, col = "black")
legend("topright",
       c("common sp.","common + intermediate sp.","all sp. including rare"),
       lty = c(1, 1, 1), pch = c(16, 16, 1),
       col = c(rgb(1, 0, 0, 0.5), rgb(0, 1, 0, 0.5), "black"),
       bty = "n", cex = 1.2)

par(op)
dev.off()

# ---- KNZ: one time-series panel per common species (pseudo species included) ----
# Also records, per species, the number of zero values (n0) and of tied values (nTies).
good_sp <- seq_along(knz_spaceavg[[1]])
lensp <- length(good_sp)

summary_knz_commonsp <- data.frame(sp = names(knz_spaceavg$avg.percent.cover),
                                   n0 = NA, nTies = NA)

pdf(paste0(resloc_knz, "rawplot_knz_spaceavg_commonsp_with_pseudosp_avgcover_soiltype_", knz_soiltype, ".pdf"),
    height = 0.5 * lensp, width = 0.5 * lensp)
op <- par(mfrow = c(6, 5), mar = c(5, 5, 3, 3))

for (i in good_sp) {
  sp_dat <- knz_spaceavg$avg.percent.cover[[i]]

  n0 <- sum(sp_dat$Dat == 0)
  nTies <- sum(duplicated(sp_dat$Dat))
  summary_knz_commonsp$n0[i] <- n0
  summary_knz_commonsp$nTies[i] <- nTies

  # Red marks species present in all years, blue the others.
  col1 <- if (n0 == 0) rgb(1, 0, 0, 0.3) else rgb(0, 0, 1, 0.3)

  plot(sp_dat$Year, sp_dat$Dat,
       type = "b", pch = 19, col = col1,
       ylim = c(0, max(sp_dat$Dat)),
       xlab = "Year (1983-2018)", ylab = "avg. % cover")
  abline(h = 0)

  # Panel title: species index, name, and number of ties.
  mtext(paste0("sp = ", i, " : ", names(knz_spaceavg$avg.percent.cover)[i], " ,nT=", nTies))
}

par(op)
dev.off()

# ---- KNZ: grid of pairwise copula plots for the common + pseudo species ----
# vivj_matrix() comes from the sourced script; with ploton = TRUE it is called
# here once per (i, j) pair to fill an lensp x lensp grid of panels.
source("./vivj_matrix.R")

include_indep <- FALSE

# Page size grows with the number of species (2 inches per panel).
pdf(paste0(resloc_knz, "copulaplot_knz_spaceavg_commonsp_with_pseudosp_avgcover_soiltype_", knz_soiltype, ".pdf"),
    height = 2 * lensp, width = 2 * lensp)
op <- par(mfrow = c(lensp, lensp), mar = c(3, 3, 3, 3), mgp = c(1.5, 0.5, 0))

for (i in seq_len(lensp)) {
  for (j in seq_len(lensp)) {
    vivj_matrix(d_allsp = knz_spaceavg, loc = 1,
                i = good_sp[i], j = good_sp[j],
                level = 0.05, ploton = TRUE, onbounds = TRUE,
                lb = 0, ub = 0.5,
                include_indep = include_indep)
  }
}

par(op)
dev.off()
```

<!-- non-parametric analysis for cor stat with common + pseudo sp. in knz-->
```{r npa_knz_spaceavg, echo=F, cache=T, cache.extra=list(seed,knz_spaceavg,mtime("NonParamStat.R"),mtime("vivj_matrix.R"),mtime("CopulaFunctions.R"), mtime("CopulaFunctions_flexible.R"))}

# Non-parametric correlation statistics for KNZ (common + pseudo species).
# multcall() is provided by the sourced NonParamStat.R.
set.seed(seed)
source("./NonParamStat.R")

# Output folders: a general one, and one per soil type inside it.
resloc_knz_npa <- paste0(resloc_knz, "corstat_knz_spaceavg_results/")
if (!dir.exists(resloc_knz_npa)) {
  dir.create(resloc_knz_npa)
}

resloc2 <- paste0(resloc_knz_npa, "soiltype_", knz_soiltype, "/")
if (!dir.exists(resloc2)) {
  dir.create(resloc2)
}

# Analysis settings.
resloc <- resloc2
nbin_knz <- 2
include_indep <- FALSE

corstat_knz_spaceavg <- multcall(
  d_allsp = knz_spaceavg,
  loc = 1,
  resloc = resloc,
  good_sp = seq_along(knz_spaceavg[[1]]),
  nbin = nbin_knz,
  include_indep = include_indep
)

# File name carries the number of bins used.
saveRDS(corstat_knz_spaceavg,
        paste0(resloc, "corstat_knz_spaceavg_nbin_", nbin_knz, ".RDS"))
```

<!-- generating Corl - Coru plots from the non-parametric analysis with common + pseudo sp. in knz -->
```{r plot_res_knz_npa_spaceavg, echo=F, results="hide",cache=T,warning=F, cache.extra=list(seed,knz_spaceavg,resloc2,corstat_knz_spaceavg,nbin_knz,mtime("NonParamStat_matrixplot.R"),mtime("mycorrplot_with_sig.R"),mtime("tailsignif.R"),mtime("CopulaFunctions_flexible.R"))}
# Matrix plots of the non-parametric statistics for KNZ.
# NonParamStat_matrixplot() is provided by the sourced script.
set.seed(seed)
source("./NonParamStat_matrixplot.R")

resloc <- resloc2

# Settings handed to the plotting routine.
include_indep <- FALSE
sigtest <- FALSE
CI <- c(0.025, 0.975)
numsims <- 10000
# Series length, taken from the Year column of the first species
# (the Year column is built from 1983:2018).
numpts <- length(knz_spaceavg$avg.percent.cover[[1]]$Year)
ub <- 1 / nbin_knz

CorlmCoru_knz_spaceavg <- NonParamStat_matrixplot(
  data = corstat_knz_spaceavg, resloc = resloc, tagon = TRUE,
  type = "lower", wd = 15, ht = 15,
  sigtest = sigtest, ub = ub, numpts = numpts,
  numsims = numsims, CI = CI, include_indep = include_indep
)

saveRDS(CorlmCoru_knz_spaceavg,
        paste0(resloc, "CorlmCoru_knz_spaceavg_nbin_", nbin_knz, ".RDS"))
```

```{r binom_sigtest_knz, echo=F}
# only run this chunk if sigtest=TRUE in previous chunks
#source("./binomial_sigtest.R")
# set.seed(seed=101) # not needed as binom test uses two-sided binom.test function 
# also it does not matter here binom_sig="LT" or binom_sig="UT"
#sigtest_knz<-binomial_sigtest(ylist=CorlmCoru_knz_spaceavg,binom_sig="LT") 
```

```{r read knz_data_soiltype_t, echo=F} 
# Load the KNZ common + pseudo species time-series matrix from disk.
# Do not cache this chunk (original author's instruction).
# The path is fixed to the soil type "t" results file.
ts_mat_CP_knz<-readRDS("./Results/knz_results/skewness_results/ts_CP_knz_soiltype_t.RDS")
```

<!--get appropriate surrogates for knz with common+pseudo sp.-->
```{r make_surrogs_CP_knz_t, echo=F, results="hide", warning=F, cache=T,cache.extra=list(ts_mat_CP_knz,mtime("PPSurrogObjFun.R"),mtime("pwlin.R"),mtime("getmap.R"),mtime("alignranks.R"),mtime("SurrogsForKonza_t.R"))}

# Run the surrogate-generation script for Konza, soil type "t".
source("./SurrogsForKonza_t.R")
# Per the original note: after this script runs, the surrogates are in the
# variable surrogs_CP_KNZ_t (later chunks read it).
```

<!-- generating stability-based results and plots for knz -->
```{r skewness_knz_spaceavg_PP, echo=F, results="hide", warning=F, cache=T, cache.extra=list(seed,surrogs_CP_KNZ_t,ts_mat_CP_knz,mtime("make_tab_stability_assessment.R"),mtime("mycvsq.R"),mtime("SkewnessAnd3CentMom.R"))}

# Stability assessment for KNZ on a random subset of the surrogates.
# make_tab_stability() is provided by the sourced script.
set.seed(seed)
source("make_tab_stability_assessment.R")

# Surrogate array produced by the SurrogsForKonza_t.R chunk.
surrogs_CP_knz <- surrogs_CP_KNZ_t

# Draw numsurrog surrogate matrices, without replacement, along the third
# dimension of the Pearson-preserving surrogate array.
numsurrog <- 10000
id_surrogs <- sample(1:dim(surrogs_CP_knz)[3], numsurrog, replace = FALSE)
surrogs_CP_knz_sampled <- surrogs_CP_knz[, , id_surrogs]

stability_CP_knz <- make_tab_stability(m = ts_mat_CP_knz,
                                       surrogs = surrogs_CP_knz_sampled,
                                       surrogs_given = TRUE)
saveRDS(stability_CP_knz, "./Results/knz_results/skewness_results/stability_CP_knz.RDS")

# Summary table, labelled "C+P" (common + pseudo), also written as CSV.
ans <- stability_CP_knz$df_stability
rownames(ans) <- "C+P"
class(ans)

write.csv(ans, "./Results/knz_results/skewness_results/knz_stability_CP.csv")

# ---- KNZ stability figure: histograms of CV^2 and skewness of the surrogates ----
# In each panel: filled dot = value from the real data, triangle = surrogate
# median, dotted lines = 2.5% / 97.5% surrogate quantiles, dashed line = value
# under independence (all read from the columns of `ans`).
pdf("./Results/knz_results/skewness_results/knz_pearson_preserving_results_cvsq_skw_plots.pdf",
    height = 2.5, width = 10)
op <- par(mfrow = c(1, 2), mar = c(6, 5, 0.2, 2), mgp = c(3, 1, 0))

# ---- left panel: CV^2 ----
xlm <- range(ans$cvsq_real, ans$cvsq_indep, stability_CP_knz$cvsq_surrogs)

hist(stability_CP_knz$cvsq_surrogs,
     breaks = 100, col = "grey", border = FALSE,
     xlim = xlm, xaxt = "n", main = "", cex.lab = 1.5,
     xlab = expression(paste(CV^2," of surrogates: Konza")))
# Only the two ends of the x range are labelled.
axis(side = 1, at = round(xlm, 3))
# actual CVsq from real data
points(x = ans$cvsq_real, y = 0, col = "black", pch = 20, cex = 1)
# 95% quantiles of the surrogates
abline(v = c(ans$cvsq_ntd_0.025CI, ans$cvsq_ntd_0.975CI), col = "black", lty = "dotted")
# CVsq with no tail dep. (surrogate median)
points(x = ans$cvsq_ntd_median, y = 0, col = "black", pch = 2, cex = 1.5)
# CVsq if independent
abline(v = ans$cvsq_indep, col = "black", lty = "dashed")

# ---- right panel: skewness ----
xlm <- range(ans$skw_real, ans$skw_indep, stability_CP_knz$skw_surrogs)

hist(stability_CP_knz$skw_surrogs,
     breaks = 100, col = "grey", border = FALSE,
     xlim = xlm, xaxt = "n", main = "", cex.lab = 1.5,
     xlab = "Skewness of surrogates: Konza")
# Both ends of the x range plus zero are labelled.
axis(side = 1, at = round(c(xlm[1], 0, xlm[2]), 3))
# actual skewness from real data
points(x = ans$skw_real, y = 0, col = "black", pch = 20, cex = 1)
# 95% quantiles of the surrogates
abline(v = c(ans$skw_ntd_0.025CI, ans$skw_ntd_0.975CI), col = "black", lty = "dotted")
# Skewness with no tail dep. (surrogate median)
points(x = ans$skw_ntd_median, y = 0, col = "black", pch = 2, cex = 1.5)
# Skewness if independent
abline(v = ans$skw_indep, col = "black", lty = "dashed")

# The legend is drawn into a separate PDF, not onto these panels.
par(op)
dev.off()

# Stand-alone legend strip (four entries laid out in a single row), written to its own pdf
pdf("./Results/knz_results/skewness_results/legend_plot.pdf", height = 0.5, width = 12.5)
op <- par(mar = c(0.1, 0.1, 0.1, 0.1), mgp = c(1, 1, 0))

plot.new()

# order in which the four entries are placed; 1:4 keeps them as listed below
legend_order <- matrix(1:4, ncol = 4, byrow = FALSE)

# entry i is described by element i of each of the three vectors
legend_labels <- c("actual community value (com),",
                   expression(paste("median (nta surrogates),")),
                   "95% CI (nta surrogates),",
                   "no interaction (ind)")
legend_lty <- c(NA, NA, 3, 2)   # line types: none, none, dotted, dashed
legend_pch <- c(20, 2, NA, NA)  # symbols: filled dot, open triangle, none, none

legend("topright",
       lty = legend_lty[legend_order],
       pch = legend_pch[legend_order],
       bty = "n", pt.cex = 1.2, cex = 1.2,
       legend = legend_labels[legend_order],
       x.intersp = 0, ncol = 4)
# par(op) is not called: the device is closed immediately afterwards
dev.off()
```

```{r few_checks_on_surrogs_knz, echo=F, results="hide", cache=T, cache.extra=list(seed,ts_mat_CP_knz,surrogs_CP_KNZ_t,mtime("PPsurrogs_tests.R"),mtime("get_var_ratio.R"))}
set.seed(seed)
source("./PPsurrogs_tests.R")

# Pick, without replacement, the indices of numsurrog surrogate matrices
# (third array dimension) from the Pearson preserving array
numsurrog <- 10000
id_surrogs <- sample.int(dim(surrogs_CP_KNZ_t)[3], size = numsurrog, replace = FALSE)
surrogs_CP_KNZ_t_sampled <- surrogs_CP_KNZ_t[, , id_surrogs]

# Run the checks on the sampled subset only (not on the full surrogate array)
# and store the outcome for the plotting chunk
ans <- PPsurrogs_tests(m = ts_mat_CP_knz, surrogs = surrogs_CP_KNZ_t_sampled)
saveRDS(ans,
        "./Results/knz_results/skewness_results/pp_surrogs_knz_t_CP/PPsurrogs_tests_with_KNZtSurrogates.RDS")
```

```{r plotter_PPsurrogs_check_knz, echo=F, results="hide", cache=T, cache.extra=list(seed,ts_mat_CP_knz,mtime("./Results/knz_results/skewness_results/pp_surrogs_knz_t_CP/PPsurrogs_tests_with_KNZtSurrogates.RDS"),mtime("Plotter_PPsurrogs_tests.R"))}

set.seed(seed)
source("./Plotter_PPsurrogs_tests.R")

# Inputs for the plotter: the Konza community matrix, the saved test results,
# and the folder passed to the plotter as its result location
resloc <- "./Results/knz_results/skewness_results/pp_surrogs_knz_t_CP/"
m <- ts_mat_CP_knz
ans <- readRDS("./Results/knz_results/skewness_results/pp_surrogs_knz_t_CP/PPsurrogs_tests_with_KNZtSurrogates.RDS")

Plotter_PPsurrogs_tests(m = m,
                        ans = ans,
                        resloc = resloc,
                        tag_legend = c("(B)", "(D)", "(F)"))
```

```{r for_Kathy, echo=F, results="hide"}

# Export, for each site, (i) the species-by-year matrix and (ii) the total-abundance
# time series as csv files.
# The RDS objects keep the years in the row names and have one column per species.
# In the csv files the years are written as an explicit first column "Year" and the
# row names are suppressed, because csv round-trips of row names tend to produce an
# unnamed first column.

#------------ for Hays --------------------
ts_mat_CP_hays <- readRDS("./Results/hays_results/skewness_results/ts_mat_CP_hays.RDS")
Year_hays <- rownames(ts_mat_CP_hays)

# species matrix with the year as first column
ts_mat_CP_hays_for_Kathy <- cbind(Year = Year_hays, ts_mat_CP_hays)
row.names(ts_mat_CP_hays_for_Kathy) <- NULL
write.csv(ts_mat_CP_hays_for_Kathy,
          "./Results/hays_results/skewness_results/ts_mat_CP_hays_for_Kathy.csv",
          row.names = FALSE)

# total over species for every year, again with the year as first column
xtot_hays <- cbind(Year = Year_hays,
                   x_tot = apply(ts_mat_CP_hays, 1, sum))
write.csv(xtot_hays,
          "./Results/hays_results/skewness_results/xtotal_hays_for_Kathy.csv",
          row.names = FALSE)

#-------- for KNZ ---------------
ts_mat_CP_knz <- readRDS("./Results/knz_results/skewness_results/ts_CP_knz_soiltype_t.RDS")
# characters 6 to 9 of each Konza row name are used as the year
Year_knz <- substr(rownames(ts_mat_CP_knz), 6, 9)

# species matrix with the year as first column
ts_mat_CP_knz_for_Kathy <- cbind(Year = Year_knz, ts_mat_CP_knz)
row.names(ts_mat_CP_knz_for_Kathy) <- NULL
write.csv(ts_mat_CP_knz_for_Kathy,
          "./Results/knz_results/skewness_results/ts_mat_CP_knz_for_Kathy.csv",
          row.names = FALSE)

# total over species for every year, again with the year as first column
xtot_knz <- cbind(Year = Year_knz,
                  x_tot = apply(ts_mat_CP_knz, 1, sum))
write.csv(xtot_knz,
          "./Results/knz_results/skewness_results/xtotal_knz_for_Kathy.csv",
          row.names = FALSE)
```

```{r for_skew_fig, echo=F, results="hide", cache=T, cache.extra=list(mtime("./Results/hays_results/skewness_results/ts_mat_CP_hays.RDS"),mtime("./Results/knz_results/skewness_results/ts_CP_knz_soiltype_t.RDS"),mtime("./Results/hays_results/skewness_results/pp_surrogs_hays_CP/HaysSurrogates.RDS"),mtime("./Results/knz_results/skewness_results/pp_surrogs_knz_t_CP/KNZtSurrogates.RDS"))}
#Here I generate a figure (./Results/EmpiricalParallelOfFig1.pdf) that shows the following:
#1. The histogram for the total-abundance time series for Hays
#2. The histogram for the total-abundance time series for a "typical" no-tail-association surrogate for Hays 
#(one single surrogate, but a typical one - among the first 1000 surrogates, the one whose skewness is 
#closest to the median skewness)
#3. The same two histograms for Konza (for Konza the 5th-closest surrogate to the median is used, see below)
#4. Above each pair of histograms, the total-abundance time series of the data (solid line) and of the 
#selected surrogate (dashed line)
#Layout: 3 rows by 2 columns. Hays is the left column (panels A, B, C from top to bottom) and Konza is the 
#right column (panels D, E, F from top to bottom). Panels are placed by hand with par(fig=...), so the 
#order of the par()/plot calls below matters.

#Variables (created above) that will be used:
#ts_mat_CP_hays: this is the matrix for Hays, each column is a species, aggregated pseudo-species in the last 
#column, and row names equal to year. See the chunk "for_Kathy".
#ts_mat_CP_knz: Same for Konza. See the chunk "for_Kathy".
#surrogs_CP_hays: Pearson preserving surrogates for Hays, created in the r chunk named "make_surrogs_CP_hays"
#surrogs_CP_KNZ_t: same for Konza, created in the r chunk named "make_surrogs_CP_knz_t"
#myskns: called below but not defined in this chunk, so it must already exist in the workspace 
#(presumably a skewness function, given the "sk=" labels - confirm)

#In fact, the above variables are read in from where they were previously saved - makes these codes runnable 
#separately from the chunk structure of this doc, to be commented after debugging
ts_mat_CP_hays<- readRDS("./Results/hays_results/skewness_results/ts_mat_CP_hays.RDS")  
ts_mat_CP_knz<-readRDS("./Results/knz_results/skewness_results/ts_CP_knz_soiltype_t.RDS")
surrogs_CP_hays<-readRDS("./Results/hays_results/skewness_results/pp_surrogs_hays_CP/HaysSurrogates.RDS")
surrogs_CP_KNZ_t<-readRDS( "./Results/knz_results/skewness_results/pp_surrogs_knz_t_CP/KNZtSurrogates.RDS" )

#***plot dimensions, units inches
xmarht<-.5 #height reserved for an x-axis margin
ymarnumwd<-.25 #width reserved for y-axis tick numbers
ymarlabwd<-.25 #width reserved for the y-axis label
ymarwd<-ymarnumwd+ymarlabwd #full left margin of the left column
totwd<-6
gap<-0.2 #general purpose gap, aside from the below purposes
panwd<-(totwd-ymarwd-ymarnumwd-gap)/2 #two panels side by side share the remaining width
panht<-panwd/2
totht<-2*xmarht+3*panht+2*gap #three rows of panels, two x-axis margins, two gaps
pdf(file="./Results/EmpiricalParallelOfFig1.pdf",width=totwd,height=totht)

#make histogram of total abundance of Hays (panel B, middle row of left column)
tot_hays<-unname(apply(FUN=sum,MARGIN=1,X=as.matrix(ts_mat_CP_hays)))

#first panel drawn on the device, so no new=TRUE here
par(fig=c((ymarwd)/totwd,
          (ymarwd+panwd)/totwd,
          (xmarht+panht+gap)/totht,
          (xmarht+2*panht+gap)/totht),
    mai=c(0,0,0,0),mgp=c(3,.15,0),tcl=-.25)
haysbreaks<-seq(from=2000,to=9000,by=1000) #fixed bins, shared by both Hays histograms
h<-hist(tot_hays,main="",breaks=haysbreaks)
rug(tot_hays) #does not currently work in R markdown, but I hope it'll work when results are exported to a pdf instead
mtext("Frequency",2,1.2)
text(min(haysbreaks),max(h$counts),"B",cex=1.2,adj=c(0,1)) #panel letter, top left
text(max(haysbreaks),.7*max(h$counts),paste0("sk=",round(myskns(tot_hays),3)),cex=.8,adj=c(1,1)) #myskns value, right-aligned

#calculate skewnesses of the totals of first 1000 surrogates, find the index of a close-to-median value
#(assumes at least 1000 surrogates are stored in the third dimension)
tot_hays_surrs<-apply(FUN=sum,MARGIN=c(1,3),X=surrogs_CP_hays[,,1:1000]) #years by surrogates
tot_hays_surrs_skns<-apply(FUN=myskns,MARGIN=2,X=tot_hays_surrs)
h<-abs(tot_hays_surrs_skns-median(tot_hays_surrs_skns)) #h is reused: now distance from the median skewness
ind_hays<-which(h==min(h))
ind_hays<-ind_hays[1] #keep the first one in case of ties
#tot_hays_surrs_skns[ind_hays]

#make histogram of total abundance of the selected Hays surrogate (panel C, bottom row of left column)
tot_hays_surr<-tot_hays_surrs[,ind_hays]

par(fig=c((ymarwd)/totwd,
          (ymarwd+panwd)/totwd,
          (xmarht)/totht,
          (xmarht+panht)/totht),
    mai=c(0,0,0,0),mgp=c(3,.15,0),tcl=-.25,new=TRUE)
h<-hist(tot_hays_surr,main="",breaks=haysbreaks)
mtext("Abundance",1,1.2)
mtext("Frequency",2,1.2)
text(min(haysbreaks),max(h$counts),"C",cex=1.2,adj=c(0,1))
rug(tot_hays_surr) 
text(max(haysbreaks),.7*max(h$counts),paste0("sk=",round(myskns(tot_hays_surr),3)),cex=.8,adj=c(1,1))

#plot time series for hays (panel A, top row of left column)
yr_hays<-as.numeric(rownames(ts_mat_CP_hays))

par(fig=c((ymarwd)/totwd,
          (ymarwd+panwd)/totwd,
          (2*xmarht+2*panht+gap)/totht,
          (2*xmarht+3*panht+gap)/totht),
    mai=c(0,0,0,0),mgp=c(3,.15,0),tcl=-.25,new=TRUE)
ylimits<-range(tot_hays,tot_hays_surr)
ylimits[2]<-ylimits[2]+.25*diff(ylimits) #extra headroom at the top, where the panel letter is drawn
plot(yr_hays,tot_hays,type="l",xlab="Year",ylab="Total abundance",ylim=ylimits)
lines(yr_hays,tot_hays_surr,type="l",lty="dashed") #selected surrogate
mtext("Year",1,1.2)
mtext("Abundance",2,1.2)
text(min(yr_hays),max(ylimits),"A",cex=1.2,adj=c(0,1))
#NOTE(review): dotted horizontal reference line at a hard-coded level of 7800; its meaning is not documented here
lines(range(yr_hays),rep(7800,2),type="l",lty="dotted")

#make histogram of total abundance of Konza (panel E, middle row of right column)
tot_knz<-unname(apply(FUN=sum,MARGIN=1,X=as.matrix(ts_mat_CP_knz)))

par(fig=c((ymarwd+panwd+ymarnumwd)/totwd,
          (ymarwd+2*panwd+ymarnumwd)/totwd,
          (xmarht+panht+gap)/totht,
          (xmarht+2*panht+gap)/totht),
    mai=c(0,0,0,0),mgp=c(3,.15,0),tcl=-.25,new=TRUE)
knzbreaks<-seq(from=100,to=240,by=20) #fixed bins, shared by both Konza histograms
h<-hist(tot_knz,main="",breaks=knzbreaks)
rug(tot_knz)
text(min(knzbreaks),max(h$counts),"E",cex=1.2,adj=c(0,1))
text(max(knzbreaks),.7*max(h$counts),paste0("sk=",round(myskns(tot_knz),3)),cex=.8,adj=c(1,1))

#calculate skewnesses of the totals of first 1000 surrogates, find the index of a close-to-median value
tot_knz_surrs<-apply(FUN=sum,MARGIN=c(1,3),X=surrogs_CP_KNZ_t[,,1:1000])
tot_knz_surrs_skns<-apply(FUN=myskns,MARGIN=2,X=tot_knz_surrs)
h<-abs(tot_knz_surrs_skns-median(tot_knz_surrs_skns))
h<-order(h) #surrogate indices, sorted from closest to farthest from the median skewness
#ind_knz<-which(h==min(h))
#NOTE(review): unlike Hays, the 5th-closest surrogate is selected here rather than the closest; 
#the reason is not stated in this chunk
ind_knz<-h[5]
tot_knz_surrs_skns[ind_knz] #prints the skewness of the selected surrogate

#make histogram of total abundance of the selected Konza surrogate (panel F, bottom row of right column)
tot_knz_surr<-tot_knz_surrs[,ind_knz]

par(fig=c((ymarwd+panwd+ymarnumwd)/totwd,
          (ymarwd+2*panwd+ymarnumwd)/totwd,
          (xmarht)/totht,
          (xmarht+panht)/totht),
    mai=c(0,0,0,0),mgp=c(3,.15,0),tcl=-.25,new=TRUE)
h<-hist(tot_knz_surr,main="",breaks=knzbreaks)
rug(tot_knz_surr) 
mtext("Abundance",1,1.2)
text(min(knzbreaks),max(h$counts),"F",cex=1.2,adj=c(0,1))
text(max(knzbreaks),.7*max(h$counts),paste0("sk=",round(myskns(tot_knz_surr),3)),cex=.8,adj=c(1,1))

#plot time series for Konza (panel D, top row of right column)
yr_knz<-rownames(ts_mat_CP_knz)
yr_knz<-as.numeric(substr(yr_knz,6,9)) #characters 6-9 of each row name are used as the year

par(fig=c((ymarwd+panwd+ymarnumwd)/totwd,
          (ymarwd+2*panwd+ymarnumwd)/totwd,
          (2*xmarht+2*panht+gap)/totht,
          (2*xmarht+3*panht+gap)/totht),
    mai=c(0,0,0,0),mgp=c(3,.15,0),tcl=-.25,new=TRUE)
ylimits<-range(tot_knz,tot_knz_surr)
ylimits[2]<-ylimits[2]+.25*diff(ylimits) #extra headroom at the top, where the panel letter is drawn
plot(yr_knz,tot_knz,type="l",xlab="Year",ylab="Total abundance",ylim=ylimits)
mtext("Year",1,1.2)
lines(yr_knz,tot_knz_surr,type="l",lty="dashed") #selected surrogate
#NOTE(review): dotted horizontal reference line at a hard-coded level of 125; its meaning is not documented here
lines(range(yr_knz),rep(125,2),type="l",lty="dotted")
text(min(yr_knz),max(ylimits),"D",cex=1.2,adj=c(0,1))

dev.off()
```

```{r pedagog_fig, echo=F, results="hide"}
set.seed(seed)
#source("./pedagog_ts.R")

#------------------------------------------------------------------
# another pedagog fig. showing LT, UT with same correlation
library(copula)
library(VineCopula)

# Clayton parameter obtained by inverting Spearman's rho at 0.8 (the 5 passed to
# claytonCopula() only serves to build the copula object handed to iRho)
par_rho_0.8 <- copula::iRho(claytonCopula(5), rho = 0.8)

# 200 points from each of two bivariate copulas sharing that parameter:
# VineCopula family 3 and family 13 (presumably Clayton and its 180-degree
# rotation, i.e. lower- vs upper-tail dependence - confirm against the
# VineCopula family table). The order of the two draws is kept so the random
# number stream is consumed in the same way.
ccop <- BiCopSim(N = 200, family = 3, par = par_rho_0.8)
sccop <- BiCopSim(N = 200, family = 13, par = par_rho_0.8)
#pdf("./Results/pedagog_figs/LTUT_rho_0.8.pdf",height=3,width=6)
#op<-par(mfrow=c(1,2),mar=c(3.5, 3.5, 0.1, 1.1),mgp=c(1.8,0.5,0))
#plot(ccop[,1],ccop[,2],col=rgb(0,0,0,0.1),pch=20,xlab=expression("u"[i]),ylab=expression("u"[j]),cex.lab=1.5,cex.axis=0.8,xaxs="i",yaxs="i")
#lines(c(0,0.3),c(0.3,0),type='l')
#lines(c(0,0.6),c(0.6,0),type='l')	
#ind1<-which(ccop[,1]+ccop[,2]>0.3 & ccop[,1]+ccop[,2]<0.6)
#points(ccop[ind1,1],ccop[ind1,2],col=rgb(1,0,0,0.1),pch=20)
#lines(c(0,1),c(1,0),type='l',lty=2)

#lines(c(0,1.4),c(1.4,0),type='l')	
#lines(c(0,1.7),c(1.7,0),type='l')	
#ind1<-which(ccop[,1]+ccop[,2]>1.4 & ccop[,1]+ccop[,2]<1.7)
#points(ccop[ind1,1],ccop[ind1,2],col=rgb(0,0,1,0.1),pch=20)

#text(x=0.1,y=0.9,"(A)",cex=1.3, family="serif")

#plot(sccop[,1],sccop[,2],col=rgb(0,0,0,0.1),pch=20,xlab=expression("u"[i]),ylab=expression("u"[j]),cex.lab=1.5,cex.axis=0.8,xaxs="i",yaxs="i")
#lines(c(0,0.3),c(0.3,0),type='l')
#lines(c(0,0.6),c(0.6,0),type='l')	
#ind1<-which(sccop[,1]+sccop[,2]>0.3 & sccop[,1]+sccop[,2]<0.6)
#points(sccop[ind1,1],sccop[ind1,2],col=rgb(1,0,0,0.1),pch=20)

#lines(c(0,1.4),c(1.4,0),type='l')	
#lines(c(0,1.7),c(1.7,0),type='l')	
#ind1<-which(sccop[,1]+sccop[,2]>1.4 & sccop[,1]+sccop[,2]<1.7)
#points(sccop[ind1,1],sccop[ind1,2],col=rgb(0,0,1,0.1),pch=20)
#lines(c(0,1),c(1,0),type='l',lty=2)

#text(x=0.1,y=0.9,"(B)",cex=1.3, family="serif")

#par(op)
#box(which="figure")
#dev.off()

#------------------------------------------------------------------
#Another version of the same thing, but splitting the two panels into two figures and
#also making some other presentational tweaks

# layout of a single-panel figure, units inches
panht <- 1
panwd <- 1
xht <- .4    # room below the panel for the x axis
ywd <- .4    # room left of the panel for the y axis
gap <- .05
totwd <- ywd + panwd + gap
totht <- xht + panht + gap

# the two anti-diagonal segments in the first figure join (0, 2*b) to (2*b, 0)
bl <- .1
bu <- .25

# Opens the pdf and places the single panel inside it
open_copula_panel <- function(pdf_file) {
  pdf(pdf_file, width = totwd, height = totht)
  par(fig = c((ywd) / totwd,
              (ywd + panwd) / totwd,
              (xht) / totht,
              (xht + panht) / totht),
      mai = c(0, 0, 0, 0), mgp = c(3, .15, 0), tcl = -.25)
  invisible(NULL)
}

# Adds what both figures share once the points are plotted: the 1:1 line,
# both axes and the u_i / u_j axis labels
finish_copula_panel <- function() {
  lines(c(0, 1), c(0, 1), lty = "solid")
  axis(side = 1, labels = TRUE, cex.axis = 1)
  axis(side = 2, labels = TRUE, cex.axis = 1)
  mtext(expression(u[i]), side = 1, line = 1, cex = 1)
  mtext(expression(u[j]), side = 2, line = 1, cex = 1)
  invisible(NULL)
}

# --- figure 1: sample stored in ccop, converted to pseudo-observations
open_copula_panel("./Results/pedagog_figs/LTUT_rho_0p8_LTdep.pdf")

ccop <- copula::pobs(ccop)
plot(ccop[,1],ccop[,2],pch=20,xaxt="n",yaxt="n",cex=.5)
lines(c(0, 2 * bl), c(2 * bl, 0), type = 'l')
lines(c(0, 2 * bu), c(2 * bu, 0), type = 'l')
finish_copula_panel()

dev.off()

# --- figure 2: sample stored in sccop, converted to pseudo-observations
#     (no anti-diagonal segments are drawn here)
open_copula_panel("./Results/pedagog_figs/LTUT_rho_0p8_UTdep.pdf")

sccop <- copula::pobs(sccop)
plot(sccop[,1],sccop[,2],pch=20,xaxt="n",yaxt="n",cex=.5)
finish_copula_panel()

dev.off()

#------------------------------------------------------------------
# Another type of pedagog fig
source("./pedagog_cv2_skw.R") # we used seed=104 in this source function
```

```{r theory_fig, echo=F, results="hide"}
# Run the two theory-figure scripts in turn; seed=101 is used inside each of them.
theory_scripts <- c("./TheoryFig202005_MT.R", # the stripped down, main text version
                    "./TheoryFig202005.R")
for (theory_script in theory_scripts) {
  source(theory_script)
}
```

<!--Investigations of empirical cdfs, added 2020 06-->
```{r empcdfs, echo=FALSE, results="hide"}
#Observed communities as plain matrices: one row per year (years in the row names),
#one column per species/pseudo-species (names in the column names)
dhays <- as.matrix(ts_mat_CP_hays)
dkonz <- as.matrix(ts_mat_CP_knz)

#nta surrogates prepared by earlier code: years by species/pseudo-species by surrogates
#NOTE(review): the Konza check chunk in this file names its sampled array
#surrogs_CP_KNZ_t_sampled - confirm that surrogs_CP_knz_sampled is defined upstream
shays <- surrogs_CP_hays_sampled
skonz <- surrogs_CP_knz_sampled

#So-called independent surrogates: every species time series is randomly permuted on
#its own, independently of the other species. One permuted community is produced for
#each nta surrogate, so both surrogate sets have the same size.
make_indep_surrogs <- function(dat, n_surr) {
  #stack n_surr copies of the data, then shuffle each (species, surrogate) column
  stacked <- array(dat, dim = c(dim(dat)[1], dim(dat)[2], n_surr))
  apply(stacked, c(2, 3), function(x) {
    sample(x, length(x), FALSE)
  })
}
sihays <- make_indep_surrogs(dhays, dim(shays)[3])
sikonz <- make_indep_surrogs(dkonz, dim(skonz)[3])

#xtot (sum over species) per year for the real data ...
dhays_xtot <- unname(apply(dhays, 1, sum))
dkonz_xtot <- unname(apply(dkonz, 1, sum))

#... and per year and surrogate (years by surrogates) for each surrogate set
xtot_by_surrog <- function(surr_array) {
  unname(apply(surr_array, c(1, 3), sum))
}
shays_xtot <- xtot_by_surrog(shays)
skonz_xtot <- xtot_by_surrog(skonz)
sihays_xtot <- xtot_by_surrog(sihays)
sikonz_xtot <- xtot_by_surrog(sikonz)

#empirical cdfs for real data: sorted xtot values (column "x") against rank/T (column "y")
dhays_ecdf<-cbind(x=sort(dhays_xtot),y=seq_along(dhays_xtot)/length(dhays_xtot))
dkonz_ecdf<-cbind(x=sort(dkonz_xtot),y=seq_along(dkonz_xtot)/length(dkonz_xtot))

#empirical cdfs for surrogates
#
#Builds the ecdf table for one set of surrogates. The same code used to be repeated
#four times, each copy placing the "y" column with the loop index left over after the loop.
#
#Args
#xtot       Matrix of total abundance, one row per year and one column per surrogate
#
#Output
#A matrix with the same number of rows as xtot. Column j ("x1", "x2", ...) holds the
#sorted values of surrogate j; the final column "y" holds rank/T.
surrog_ecdf<-function(xtot)
{
  nyr<-dim(xtot)[1]
  nsurr<-dim(xtot)[2]
  res<-matrix(NA,nyr,nsurr+1)
  for (j in seq_len(nsurr))
  {
    res[,j]<-sort(xtot[,j])
  }
  #the y column position is computed from the matrix size, not from a leftover loop index
  res[,nsurr+1]<-seq_len(nyr)/nyr
  colnames(res)<-c(paste0("x",seq_len(nsurr)),"y")
  return(res)
}

shays_ecdf<-surrog_ecdf(shays_xtot)   #nta surrogates, Hays
skonz_ecdf<-surrog_ecdf(skonz_xtot)   #nta surrogates, Konza
sihays_ecdf<-surrog_ecdf(sihays_xtot) #independent surrogates, Hays
sikonz_ecdf<-surrog_ecdf(sikonz_xtot) #independent surrogates, Konza

#For each y axis value (each row of the ecdf tables), compute the fraction of surrogate
#x values that are less than the corresponding data x value, i.e. the fraction of
#surrogate ecdfs that surpass that y value before the data ecdf does.
# - A large value (e.g. above 0.95 or 0.975) means the quantile corresponding to that
#   y value is passed earlier for surrogates than for the real data.
# - A small value (e.g. below 0.025 or 0.05) means it is passed later for surrogates
#   than for the real data.
#
#Args
#data_ecdf   ecdf of the real data; the first column holds the sorted x values
#surr_ecdf   surrogate ecdfs; all columns but the last hold sorted surrogate x values
frac_surrogs_below <- function(data_ecdf, surr_ecdf) {
  n_surr <- dim(surr_ecdf)[2] - 1
  rows <- seq_len(dim(data_ecdf)[1])
  #compare every surrogate value in a row with the data value of that same row
  is_below <- surr_ecdf[rows, 1:n_surr, drop = FALSE] < data_ecdf[rows, 1]
  unname(rowSums(is_below)) / n_surr
}

quants_hays_nta <- frac_surrogs_below(dhays_ecdf, shays_ecdf)
quants_konz_nta <- frac_surrogs_below(dkonz_ecdf, skonz_ecdf)
quants_hays_ind <- frac_surrogs_below(dhays_ecdf, sihays_ecdf)
quants_konz_ind <- frac_surrogs_below(dkonz_ecdf, sikonz_ecdf)

#show the four vectors
quants_hays_nta
quants_hays_ind
quants_konz_nta
quants_konz_ind

#now make some plots

#Makes plots comparing empirical and surrogate distributions of xtot
#
#Writes a two-panel pdf. Panel A (top) compares the data ecdf (red points) with the nta
#surrogates, panel B (bottom) with the independent surrogates. In each panel the lines
#are, per y value, the minimum and maximum (dotted), the 2.5% and 97.5% quantiles
#(dashed) and the median (solid) of the surrogate x values. The numbers on the right are
#the fraction of surrogate x values below the data x value at that y value; they are
#printed in two staggered columns (alternating by row) and in red when above .95 or
#below .05.
#
#Args
#ecdf       The empirical ecdf, dhays_ecdf or dkonz_ecdf from above (columns "x" and "y")
#secdf      The surrogate ecdfs, nta surrogs, shays_ecdf or skonz_ecdf from above
#           (one column per surrogate followed by a last column "y")
#siecdf     The surrogate ecdfs, indep surrogs, sihays_ecdf or sikonz_ecdf from above
#           (same layout; may hold a different number of surrogates than secdf)
#filename   Output path without extension; ".pdf" is appended
#
#Output
#The value returned by dev.off(), after the pdf has been written
ecdfplot<-function(ecdf,secdf,siecdf,filename)
{
  #set up the plot, plotting dimensions units inches
  xmarght<-.5
  ymargwd<-.5
  gap<-0.1
  panwd<-5
  totwd<-ymargwd+panwd+gap
  panht<-.6*panwd
  totht<-xmarght+2*panht+2*gap

  #surrogate x values only (all columns but the trailing "y"); drop=FALSE keeps these
  #as matrices even when there is a single surrogate. Each set is sized from its own
  #matrix, so the two sets may differ in the number of surrogates.
  sx<-secdf[,seq_len(dim(secdf)[2]-1),drop=FALSE]
  six<-siecdf[,seq_len(dim(siecdf)[2]-1),drop=FALSE]

  #axis limits shared by both panels; the x axis is widened to the right to make
  #room for the printed numbers
  xlimits<-range(sx,six)
  ylimits<-c(0,1)
  xlimits_exp<-xlimits
  xlimits_exp[2]<-xlimits[2]+.3*diff(xlimits)

  #adds the min/2.5%/median/97.5%/max lines of one surrogate set to the current panel
  add_envelope<-function(surr_x,yvals)
  {
    lines(apply(FUN=min,X=surr_x,MARGIN=1),yvals,lty='dotted')
    lines(apply(FUN=quantile,X=surr_x,MARGIN=1,probs=.025),yvals,lty='dashed')
    lines(apply(FUN=quantile,X=surr_x,MARGIN=1,probs=.5),yvals,lty='solid')
    lines(apply(FUN=quantile,X=surr_x,MARGIN=1,probs=.975),yvals,lty='dashed')
    lines(apply(FUN=max,X=surr_x,MARGIN=1),yvals,lty='dotted')
  }

  #prints, for every row, the fraction of surrogate x values below the data x value
  add_fractions<-function(surr_x)
  {
    for (counter in seq_len(dim(ecdf)[1]))
    {
      h<-sum(surr_x[counter,]<ecdf[counter,1])/dim(surr_x)[2]
      numcol<-if (h>.95 || h<.05) "red" else par("col")
      if (counter %% 2 ==0)
      {
        #even rows: right-aligned at the far right edge
        text(xlimits_exp[2],ecdf[counter,"y"],round(h,3),adj=c(1,.5),cex=.75,col=numcol)
      }else
      {
        #odd rows: left-aligned just past the largest surrogate value
        text(xlimits[2],ecdf[counter,"y"],round(h,3),adj=c(0,.5),cex=.75,col=numcol)
      }
    }
  }

  pdf(file=paste0(filename,".pdf"),width=totwd,height=totht)
  #make sure the pdf device is closed even if plotting fails part way through
  devid<-dev.cur()
  closed<-FALSE
  on.exit(if (!closed) dev.off(devid),add=TRUE)

  #first panel, uses nta surrogates
  par(fig=c((ymargwd)/totwd,
            (ymargwd+panwd)/totwd,
            (xmarght+panht+gap)/totht,
            (xmarght+2*panht+gap)/totht),
      mai=c(0,0,0,0),mgp=c(3,.15,0),tcl=-.25)
  
  #plot the data
  plot(ecdf[,"x"],ecdf[,"y"],col='red',type="p",ylim=ylimits,pch=20,
       xlab=expression(x[tot]),xaxt="n",
       xlim=xlimits_exp,cex=.5)
  mtext(expression(rank(x[tot])/T),side=2,line=1.1)
  axis(1,labels=FALSE)
  text(xlimits[1],ylimits[2],"A",adj=c(0,1))
  
  #plot distributions of surrogates, nta surrogates
  add_envelope(sx,secdf[,"y"])

  #plot the numbers
  add_fractions(sx)

  #second panel, uses ind surrogates
  par(fig=c((ymargwd)/totwd,
            (ymargwd+panwd)/totwd,
            (xmarght)/totht,
            (xmarght+panht)/totht),
      mai=c(0,0,0,0),mgp=c(3,.15,0),tcl=-.25,new=TRUE)
  
  #plot the data again
  plot(ecdf[,"x"],ecdf[,"y"],col='red',type="p",ylim=ylimits,pch=20,
       xlab=expression(x[tot]),
       xlim=xlimits_exp,cex=0.5)
  mtext(expression(rank(x[tot])/T),side=2,line=1.1)
  mtext(expression(x[tot]),side=1,line=1.1)
  text(xlimits[1],ylimits[2],"B",adj=c(0,1))
  
  #plot distributions of surrogates, ind surrogates
  add_envelope(six,siecdf[,"y"])
    
  #plot the numbers
  add_fractions(six)
  
  closed<-TRUE
  dev.off(devid)
}

#one two-panel pdf per site; ecdfplot appends ".pdf" to the filename
ecdfplot(ecdf=dhays_ecdf,secdf=shays_ecdf,siecdf=sihays_ecdf,filename="./Results/Hays_ecdfs")
ecdfplot(ecdf=dkonz_ecdf,secdf=skonz_ecdf,siecdf=sikonz_ecdf,filename="./Results/Konz_ecdfs")
```

# Details of the simulations for Fig. \ref{MT-fig_pedag_cv2_skw}\label{PedagogFigMethods}

The populations of Fig. \ref{MT-fig_pedag_cv2_skw} were simulated as follows. Let $\Sigma$ be the $11 \times 11$
matrix with $1$s on the diagonal, $\Sigma_{1j}=\Sigma_{j1}=-0.2$ for all $j\neq 1$, and all 
other entries equal to $0.1$. Define the random vector $(w_0,w_1,\ldots,w_{10})$ as what one gets
by drawing a random vector from the $11$-dimensional multivariate normal distribution with mean 
$(0,\ldots,0)$ and covariance matrix $\Sigma$, and then applying the cumulative distribution function
of a univariate standard normal distribution to each component. The univariate marginal distributions 
of $(w_0,\ldots,w_{10})$ are uniform distributions on the unit interval. 

Next, for the populations of Fig. \ref{MT-fig_pedag_cv2_skw}A, 
define a random vector $(u_1,u_2,\ldots,u_{20})$ as follows. First draw a uniformly distributed 
random variable, $f$, on the unit interval. Then generate a random vector $(w_0,\ldots,w_{10})$
as described in the previous paragraph. Then, if $f<\frac{1}{2}$, let $u_1,\ldots,u_{10}$ all equal
$1/2-w_0/2$ and let $u_{11}=1-w_1/2,u_{12}=1-w_2/2,\ldots,u_{20}=1-w_{10}/2$. If $f \geq \frac{1}{2}$, instead let
$u_{11},\ldots,u_{20}$ all equal $1/2-w_0/2$ and let $u_{1}=1-w_1/2,u_{2}=1-w_2/2,\ldots,u_{10}=1-w_{10}/2$.

For the populations of Fig. \ref{MT-fig_pedag_cv2_skw}B, 
define a random vector $(u_1,u_2,\ldots,u_{20})$ as follows. First draw a uniformly distributed 
random variable, $f$, on the unit interval. Then generate a random vector $(w_0,\ldots,w_{10})$
as described in the above paragraph. Then, if $f<\frac{1}{2}$, let $u_1,\ldots,u_{10}$ all equal
$w_0/2+1/2$ and let $u_{11}=w_1/2,u_{12}=w_2/2,\ldots,u_{20}=w_{10}/2$. If $f \geq \frac{1}{2}$, instead let
$u_{11},\ldots,u_{20}$ all equal $w_0/2+1/2$ and let $u_{1}=w_1/2,u_{2}=w_2/2,\ldots,u_{10}=w_{10}/2$.

Next, get the vector $(q_1(t),\ldots,q_{20}(t))$ for each time step $t$ by generating an independent
vector $(u_1,\ldots,u_{20})$ as described in one of the previous two paragraphs, and then applying the inverse of the 
cumulative distribution function of a univariate standard normal distribution to each component. This makes
the marginal distribution for each species standard normal. Population time series were obtained by 
shifting all the $q_i(t)$ vertically to make them all positive and to make means the same in the two cases
considered in Fig. \ref{MT-fig_pedag_cv2_skw}.

# Details of the simulations for the Theory section and extended results \label{TheoryMethods}

The random variables $(x_1,x_2,\ldots,x_N)$ of Fig. \ref{MT-fig_theory} and SI Fig. \ref{fig_theory_SM} 
were generated as follows. Let $0 \leq \beta_1 < \beta_2 \leq 1$
be thresholds which will control the tails in which the $x_i$ are perfectly related, and let $0 \leq \rho \leq 1$ be a correlation
parameter which will control the strength of the correlation between the components, $x_i$, outside of the perfectly correlated tails.
Let $\Sigma$ be a covariance matrix with $1$s on the diagonal and $\rho$ in all off-diagonal entries. Let 
$(u_1,\ldots,u_N)$ be a random vector obtained by generating points from an $N$-dimensional multivariate normal distribution with 
mean $(0,\ldots,0)$ and covariance matrix $\Sigma$ and then applying the cumulative distribution function of the univariate
standard normal distribution to each component, to uniformize the marginals. Let $f$ be a uniformly distributed random variable
on the unit interval, and use it and the $u_i$ to define random variables $v_i$ as follows. If $f<\beta_1$ or $f>\beta_2$, let 
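$v_i=f$ for all $i$. Otherwise let $v_i=u_i(\beta_2-\beta_1)+\beta_1$, which rescales $u_i$ to the interval $[\beta_1,\beta_2]$ so that each $v_i$ is uniformly distributed on the unit interval.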
Then get $x_i$ by applying the inverse cumulative distribution function of a univariate standard normal distribution to the $v_i$.
For scenario 1, $\rho=0.6$, $\beta_1=0$, and $\beta_2=1$. For scenario 2, $\beta_1=0$, $\beta_2=0.95$; for scenario 3, 
$\beta_1=0.05$, $\beta_2=1$; and for scenario 4, $\beta_1=0.05$, $\beta_2=0.95$. For each of scenarios 2-4 (separately),
$\rho$ was chosen numerically for that scenario so that the covariance of $x_i$ and $x_j$ was the same as in scenario 1. 

Interpretations of scenario 2 and comparisons to scenarios 1 and C are in the main text. We here provide 
interpretations of scenarios 3 and 4, briefly because interpretations are similar to scenario 2. 
Scenario 3 is symmetrically like scenario 2, but
with comonotonicity in the left tails instead of the right (Fig.  \ref{fig_theory_SM}M).
Comonotonicity in the lower tails produces left/negative skewness in $\xtot$ (Fig.  \ref{fig_theory_SM}N), 
which in turn translates to elevated probabilities that $\xtot$ will fall below low disaster thresholds (Fig. \ref{fig_theory_SM}P).
For sufficiently low thresholds, the probability is the same as for scenario C, and is therefore maximal for the 
species marginal distributions used. Although it does not appear unstable to classic approaches, the scenario 3 community
is maximally unstable with respect to low disaster thresholds.
The dashed lines on Fig. \ref{fig_theory_SM}N-P are copies of the same-color 
solid lines from above panels, reproduced to facilitate comparison. 

Scenario 4 shows comonotonicity in both tails simultaneously (Fig.  \ref{fig_theory_SM}Q), causing
both extreme low and high values to occur in the distribution of $\xtot$ (Fig. \ref{fig_theory_SM}R).
This does not produce skewness in $\xtot$, because the effect is symmetric, but it does result in elevated probabilities,
relative to scenario 1, both that $\xtot$ will exceed high disaster thresholds (Fig. \ref{fig_theory_SM}S) and that it will fall below
low disaster thresholds (Fig. \ref{fig_theory_SM}T). Probabilities of exceeding sufficiently high disaster 
thresholds or falling below sufficiently low ones are, in fact, the same as scenario C, and hence are maximal,
given fixed species marginal distributions. Although it does not appear unstable to classic approaches, the scenario 4 community
is maximally unstable with respect to both low and high disaster thresholds. The dashed lines on 
Fig. \ref{fig_theory_SM}R-T are copies of the same-color 
solid lines from above panels, reproduced to facilitate comparison.

# Details of the data \label{Data} 
<!--Hays data-->
Our first dataset comprised basal cover of vascular plants mapped in permanent quadrats in 
a Kansas mixed-grass prairie every year from 1932-1972 [@adler2007long]. Researchers from 
Fort Hays State University, in Hays, Kansas, mapped all individual plants in a series 
of 1 sq. meter quadrats in a mixed grass prairie (38.8$^\circ$N, 99.3$^\circ$W) 
[@albertson1937ecology; @albertson1965vegation].
These data were later digitized [@adler2007long] and are now available
online (\url{http://esapubs.org/archive/ecol/E088/161/}). These data were widely used in 
earlier studies to understand plant demographic processes. 
Here we give a brief description of the data.
The file `allrecords.csv` is basal cover (area in $\text{cm}^2$) data for 151 unique species (when observed) 
recorded for 41 years from 51 (1m $\times$ 1m) quadrats.
Information on each unique species occurring in the entire `allrecords.csv` dataset 
was stored in `species_list.csv` file, which had column names like 'type', 'PLANTS.Symbol',
'PLANTS.Synonym' (for 
explanations, see the `metadata file` at http://esapubs.org/archive/ecol/E088/161/metadata.htm). 
5 of 151 species were labelled as belonging to a 'remove' category as 
mentioned in 'type' column of `species_list.csv`. They were non-plant 
categories like 'Bare ground' or 'Fragment', or were unidentified or poorly resolved species like
'Mixed grass', 'Polygonum spp.' (which only occurred once), and 'Unknown'. Among 51 quadrats, 36 permanent 
quadrats were arranged along a gradient 
of soil depth and located in two livestock exclosures ('e1' and 'e2'). The remaining 15 quadrats 
were located in grazed areas outside the exclosures, mostly in the 
shortgrass community. Locations of the 51 quadrats were recorded as (quadX $cm$, quadY $cm$)
in the East-West and North-South directions, respectively, relative to the south-west corner 
of exclosure 'e1' (see `quadrat_info.csv`). Each quadrat has polygons with ID numbers processed 
in GIS tracking positions for individual plants. Each polygon location was 
recorded using the centroid co-ordinate (x $cm$, y $cm$) of the corresponding 
polygon (see `allrecords.csv`). For our purposes, we computed basal 
cover time-series (1932-1972) averaged over the 36 quadrats within the exclosure 
for each of 146 species (excluding the 'remove' category described above).

<!--add description about KNZ data-->
Our second dataset comprised plant species composition data from the Konza Prairie LTER site (http://lter.konza.ksu.edu/content/pvc02-plant-species-composition-selected-watersheds-konza-prairie). Data 
were accessed in November 2019 [@Hartnett2019]. The data 
collection is an ongoing project (1983 onwards)
where canopy coverage and frequency have been recorded for different 
burn treatments in ungrazed and grazed watersheds located in 
different topographic sites (upland 
topographic locations - florence soil, lowland topographic 
locations - tully soils, and slope locations). Only locations with florence soils and tully soils were 
sampled every year (1983-). For our purposes we used data from watershed 001d as 
it is the only subset of the data operated over the longest period (1983-) in an ungrazed, 
annually burned watershed (for details see fire-history information at http://lter.konza.ksu.edu/content/kfh011). 
In this watershed, four 50 m long transects (A, B, C, and D) were established on each
topographic position; and within each transect, five evenly spaced, permanently 
marked plots are located (so there were a total of 20 plots for each topographic position). Canopy cover data are reported
in **7** classes (**1** <1%; **2** 1-5%; **3** 5-25%; **4** 25-50%; **5** 50-75%; **6** 75-95%; **7** 95-100%).
For species sampled on multiple census dates 
in a year (sampling was initially done three times a year, but from 1991 onwards 
sampling frequency dropped to twice a year) we used the highest cover class for each plot. Finally, the 
percent cover for each plot was computed by averaging the mid-points of the cover classes for the 20 plots. This is in accordance with the methods used in raw cover data processing (see 'methods manual for PVC02', http://lter.konza.ksu.edu/sites/default/files/MM.pdf). We presented our result considering average canopy cover sampled from tully soils only, as we could not produce 
suitable Pearson correlation preserving surrogate data for the other soil type (florence soils).

<!--data category-->
For both datasets, we classified all the species into 
three distinct categories depending on their 
availability throughout the study period over all 
the quadrats considered. They are 
(i) "common species", which were present at least 35 years,
(ii) "rare species", which were present for a maximum of 2 years, and (iii) "intermediate species", which
were the remaining species. For tail association analysis,
we considered only the common species for each dataset (see 
Tables \ref{tab_spinfo_hays} and \ref{tab_spinfo_knz}) along with 
the other species merged into a single pseudo species. Fig. \ref{fig_spaceavg_timeseries} 
summarizes the species fluctuation over years for different 
categories from each of our two datasets.

```{r make_tab_common_splist_for_hays, echo=F, results="asis"}
library(kableExtra)
library(dplyr)

# Build the SI table listing the `common' species of the Hays dataset plus one
# final row for the pooled pseudo-species (intermediate + rare species).
# Uses two objects created in earlier chunks: `spinfo` (read from
# species_list.csv) and `hays_spaceavg`, whose `avg.basal.cover` element is
# used here only for its names (species names).
n_common_hays <- 18 # common species; one extra row is added for the pseudo-species
common_sp_hays <- names(hays_spaceavg$avg.basal.cover)[seq_len(n_common_hays)]

# Look each species up by name with match() so that the code column is
# guaranteed to follow the same order as the species-name column. Selecting
# with `%in%` would instead return the codes in the row order of `spinfo`,
# which pairs codes with names correctly only if both are sorted identically.
spinfo_row <- match(common_sp_hays, spinfo$species)
if (anyNA(spinfo_row)) {
  stop("Hays species missing from species_list.csv: ",
       paste(common_sp_hays[is.na(spinfo_row)], collapse = ", "),
       call. = FALSE)
}

splistfullnm <- c(common_sp_hays, "Intermediate + rare species")
tab_spinfo_hays <- data.frame(
  ID = seq_len(n_common_hays + 1),
  `Sp. code` = c(as.character(spinfo$PLANTS.Symbol[spinfo_row]), "pseudo_hays"),
  `Species name` = splistfullnm,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# The header row is suppressed in kable() (col.names = NULL) and re-added with
# add_header_above() so that column_spec() italicises only the table body.
# NOTE(review): the YAML header loads main-text labels through xr with the
# prefix `MT-`; if fig_CorlmCoru is a main-text figure the caption should use
# \\ref{MT-fig_CorlmCoru} -- confirm.
knitr::kable(tab_spinfo_hays, 
             format="latex", align="c", linesep = "",
             caption.short="Species list for Hays data",
             caption = "`Common' species (see Methods) of the Hays dataset. The column `ID' is the species ID used in Fig. \\ref{fig_CorlmCoru}. The column `Sp. code' is a species code used in the downloaded, raw data. \\label{tab_spinfo_hays}", 
             booktabs = TRUE, col.names = NULL) %>%
             column_spec(3, italic = TRUE) %>%
             add_header_above(c("ID"=1,"Sp. code"=1,"Species name"=1))
```
<!--***DAN: Shyamolina, I added the text "The column 'ID' is the species ID used in Fig. \\ref{MT-fig_CorlmCoru}. The column 'Sp. code' is the species code used in the downloaded, raw data." Please verify these statements are correct.
***Shya: for Hays, Sp. Code refers to PLANTS.Symbol column of "species_list.csv" file which we downloaded as raw data (I wrote about this in Data Section line 876 of this Rmd.)
Fpr Konza, I combine genus and species column of raw data to get "Sp. Code", I got the full species name from a species list pdf file - this file is also downloadable from the link I added in my Datadescription section.-->

```{r make_tab_common_splist_for_knz, echo=F, results="asis"}
# species list for knz (PPS01) is downloadable from: http://lter.konza.ksu.edu/content/pps01-konza-prairie-plant-species-list
library(kableExtra)
library(dplyr)

# Build the SI table listing the `common' species of the Konza dataset plus one
# final row for the pooled pseudo-species (intermediate + rare species).
# Species codes come from the names of `knz_spaceavg$avg.percent.cover`
# (created in an earlier chunk); the full names are typed by hand in the same
# order, so the two vectors must have equal length.
splistfullnm <- c(
  "Ambrosia psilostachya", "Andropogon gerardii", "Asclepias verticillata",
  "Asclepias viridis", "Baptisia bracteata", "Bouteloua curtipendula",
  "Brickellia eupatorioides", "Dalea candida", "Dichanthelium oligosanthes",
  "Dichanthelium ovale", # was written with a TAB between genus and species
  "Lespedeza capitata", "Mimosa quadrivalvis", "Panicum virgatum",
  "Ruellia humilis", "Salvia azurea", "Schizachyrium scoparium",
  "Solidago altissima", # was lower-case "solidago" with an embedded TAB
  "Sorghastrum nutans", "Sporobolus compositus", "Symphyotrichum ericoides",
  "Vernonia baldwinii", "Intermediate + rare species"
)
sp_code_knz <- names(knz_spaceavg$avg.percent.cover)

# Fail early instead of letting data-frame recycling silently mis-pair codes
# with names if the set of common species ever changes.
stopifnot(length(sp_code_knz) == length(splistfullnm))

tab_spinfo_knz <- data.frame(
  ID = seq_along(splistfullnm),
  `Sp. code` = sp_code_knz,
  `Species name` = splistfullnm,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# The header row is suppressed in kable() (col.names = NULL) and re-added with
# add_header_above() so that column_spec() italicises only the table body.
# NOTE(review): the YAML header loads main-text labels through xr with the
# prefix `MT-`; if fig_CorlmCoru is a main-text figure the caption should use
# \\ref{MT-fig_CorlmCoru} -- confirm.
knitr::kable(tab_spinfo_knz, 
             format="latex", align="c", linesep = "",
             caption.short="Species list for Konza",
             caption = "`Common' species (see Methods) of the Konza dataset. The column `ID' is the species ID used in Fig. \\ref{fig_CorlmCoru}. The column `Sp. code' is a species code used in the downloaded, raw data.\\label{tab_spinfo_knz}", 
             booktabs = TRUE, col.names = NULL) %>%
             column_spec(3, italic = TRUE) %>%
             add_header_above(c("ID"=1,"Sp. code"=1,"Species name"=1))
```

\begin{table}
\begin{center}
\caption[Summary of notation]{Summary of notation.}\label{fig_notation}
\begin{tabular}{p{1.5in}|p{5.5in}}
\hline
Notation & Meaning \\ \hline
$x_i(t)$ & species abundance, species indexed by $i=1,\ldots,N$, times by $t=1,\ldots,T$ \\
$\mu_i$ & $\mean(x_i)$ \\ 
$v_{ij}$ & $\cov(x_i,x_j)$, equals $\var(x_i)$ when $j=i$ \\
$m_{iii}$ & 3rd moment of $x_i$ \\
$\xtot$ & $\sum_i x_i(t)$ \\ \hline
$\CVcomsq$ & $\var(\xtot)/(\mean(\xtot))^2=$ community instability measured using variance \\
$\CVntasq$ & what $\CVcomsq$ would be without tail associations \\
$\CVindsq$ & $\left(\sum_i v_{ii}  \right)/\left( \sum_i \mu_i \right)^{2}=$ what $\CVcomsq$ would be without species interactions \\
$\CVsynsq$ & $\left( \sum_i \sqrt{v_{ii}} \right)^2/\left( \sum_i \mu_i \right)^2=$ what $\CVcomsq$ would be under perfect linear species associations \\
$\phicv$ &  $\CVcomsq/\CVindsq=\left( \sum_{i,j} v_{ij}  \right) / \left( \sum_i v_{ii}  \right)$ \\
$\phiLdM$ & $\CVcomsq/\CVsynsq = \left( \sum_{i,j} v_{ij}  \right) / \left( \sum_i \sqrt{v_{ii}}  \right)^2$ \\
$\phicvcor$ & $\CVntasq / \CVindsq$ \\
$\phicvta$ & $\CVcomsq / \CVntasq$ \\ \hline
$\Scom$ & $\sk(\xtot)$, where $\sk(\cdot)$ is skewness \\
$\Snta$ & what $\Scom$ would be without tail associations \\
$\Sind$ & $\left( \sum_i m_{iii} \right)/\left( \sum_i v_{ii} \right)^{3/2}=$ what $\Scom$ would be without species interactions \\
$\Ssyn$ & what $\Scom$ would be under perfect synchrony, see Discussion \\
$\phiS$ & $\Scom / \Sind$ \\
$\phiSLdM$ & $\Scom / \Ssyn$, see Discussion \\
$\phiScor$ & $\Snta/\Sind$ \\
$\phiSta$ & $\Scom/\Snta$ \\ \hline
$u_i(t)$ & the normalized rank of $x_i(t)$ \\
$b_l, b_u$ & lower and upper bounds used to delineate the band in the definition of the partial Spearman correlation \\
$\cor_{b_l,b_u}(x_i,x_j)$ & partial Spearman correlation of $x_i$ and $x_j$ \\
$\cor_l$ & $\cor_{b_l,b_u}$ using $b_l=0$ and $b_u=0.5$ \\
$\cor_u$ & $\cor_{b_l,b_u}$ using $b_l=0.5$ and $b_u=1$ \\
$n_L$ & The number of values of $\cor_l(x_i,x_j)-\cor_u(x_i,x_j)$ for $i \neq j$ that were positive, representing stronger lower- than upper-tail association. Only pairs with positive total Spearman correlation were considered in the count. \\
$n_U$ & The number of values of $\cor_l(x_i,x_j)-\cor_u(x_i,x_j)$ for $i \neq j$ that were negative, representing stronger upper- than lower-tail association. Only pairs with positive total Spearman correlation were considered in the count. \\
$A_{\text{tot}}$ & the sum of $\cor_l(x_i,x_j)-\cor_u(x_i,x_j)$ across all positively associated species pairs, $i \neq j$ \\
\hline
\end{tabular}
\end{center}
\end{table}

```{r further_discussion_table, echo=F, results="asis"}
library(tibble)

# Scenario table: for each combination of phi_cv (approx. 1, > 1, < 1) and of
# the eight (phi_S, S_ind, S_com) cases, state what species relationships do to
# the variance and skewness of x_tot and to its lower and upper extremes.
# Rows 1-8 are phi_cv approx. 1, rows 9-16 are phi_cv > 1, rows 17-24 are
# phi_cv < 1; within each group the eight skewness cases appear in the order
# given by `h`.
h <- c("r skew $\\uparrow$", "l skew $\\uparrow$", "skew $\\bullet$", "skew $\\bullet$",
       "r skew $\\downarrow$", "l skew $\\downarrow$", "r to l skew", "l to r skew")

# Verdicts used in the two tail columns. "??" marks cases where the variance
# effect and the skewness effect push the tail in opposite directions.
more <- "More extreme"
less <- "Less extreme"
same <- "About the same"
ambig <- "??"

low_tail <- c(
  less,  more,  same, same, more,  less,  more,  less,   # phi_cv approx. 1
  ambig, more,  more, more, more,  ambig, more,  ambig,  # phi_cv > 1
  less,  ambig, less, less, ambig, less,  ambig, less    # phi_cv < 1
)
high_tail <- c(
  more,  less,  same, same, less,  more,  less,  more,   # phi_cv approx. 1
  more,  ambig, more, more, ambig, more,  ambig, more,   # phi_cv > 1
  ambig, less,  less, less, less,  ambig, less,  ambig   # phi_cv < 1
)

# Column names are LaTeX; tibble() keeps non-syntactic names as given, so no
# separate colnames() assignment is needed.
tabres <- tibble(
  "$\\phicv$" = c(rep("$\\approx 1$", 8), rep("$>1$", 8), rep("$<1$", 8)),
  "$\\phi_S$" = rep(c("$>1$", "$>1$", "$\\approx 1$", "$\\approx 1$",
                      "$<1$, $\\geq 0$", "$<1$, $\\geq 0$", "$<0$", "$<0$"), times = 3),
  "$\\Sind$" = rep(c("$>0$", "$<0$"), times = 12),
  "$\\Scom$" = rep(c(rep(c("$>0$", "$<0$"), times = 3), "$<0$", "$>0$"), times = 3),
  "Interpretation, what species relationships do to variance and skewness of $\\xtot$" =
    c(paste("variance $\\bullet$;", h),
      paste("variance $\\uparrow$;", h),
      paste("variance $\\downarrow$;", h)),
  "Interpretation, what species relationships do to extreme low values of $\\xtot$" = low_tail,
  "Interpretation, what species relationships do to extreme high values of $\\xtot$" = high_tail
)

# escape = FALSE so the LaTeX in cells and headers is passed through verbatim.
knitr::kable(tabres, format="latex", booktabs = TRUE, linesep = "", escape = FALSE,
             caption.short="Further interpretations of $\\phicv$ and $\\phi_S$",
             caption = "Further interpretations of what different values of $\\phicv$ and $\\phi_S$ mean
             for the effects of species relationships on the distribution of $\\xtot$. The first four columns
             contain scenarios for $\\phicv$, $\\phi_S$, $\\Sind$, and $\\Scom$. Column 5 indicates the effects 
             species relationships will therefore have on the variance and skewness of the $\\xtot$ distribution, 
             with $\\uparrow$ indicating an increase, $\\downarrow$ indicating a decrease, and $\\bullet$ indicating
             no change. The last two columns indicate the effects species relationships are expected to have on the
             extreme left and right tails of the $\\xtot$ distribution, either making those values more extreme 
             or making them less extreme. An entry `??' in one of the last two columns indicates the effects of 
             species relationships on variance and skewness suggest opposing trends for the corresponding tail 
             of $\\xtot$, so results will depend on the relative strengths of these effects. \\label{further_discussion_table}")%>%
     column_spec(column = 1,width=".9 cm")%>%
     column_spec(column = 2,width="1.8 cm")%>%
     column_spec(column = 3,width=".9 cm")%>%
     column_spec(column = 4,width=".9 cm")%>%
     column_spec(column = 5,width="4 cm")%>%
     column_spec(column = 6,width="3 cm")%>%
     column_spec(column = 7,width="3 cm")
```

# Details of the statistical methods \label{stats_details}

Our theory predicted that tail associations between positively associated pairs of species (synchronous
species) should influence the skew of the total community, $\xtot$, so this is what we investigated
empirically. We ignored pairs of species which were negatively associated (pairs of species exhibiting compensatory
dynamics). However, compensatory dynamics is known to help reduce the 
variance of $\xtot$. Tail associations between species exhibiting compensatory dynamics may influence the 
skewness of $\xtot$, or another aspect of its distribution, so the presence/absence of tail associations
in compensatory pairs of species should be studied in future work. Modified methods will be needed; for instance,
the terminology "lower-tail association" and "upper-tail association" does not make sense in the context
of negatively associated variables, since the upper tail of one variable is associated with the lower tail
of the other.

The skewness of $\xtot$ is the third central moment, 
$m_3(\xtot)$, divided by the cubed standard deviation of $\xtot$. The standard
unbiased estimator (with Bessel's correction) was used for the standard deviation. For the third central moment, we used 
the third $h$-statistic, $m_3(\xtot)=\frac{T}{(T-2)(T-1)}\sum_{t=1}^T (\xtot(t)-\mean(\xtot))^3$, which is the 
unique symmetric unbiased estimator of that quantity. 

To see that the quantity $\Sind=\frac{\sum_i m_3(x_i)}{(\sum_i v_{ii})^{3/2}}$ defined in the main text is
the value that $\Scom$ would take if species dynamics were independent, we use the definition of skew to write 
$\sk(\xtot)=\frac{m_3(\xtot)}{\var(\xtot)^{3/2}}=\frac{\sum_{i,j,k}m_{ijk}}{(\sum_{i,j}v_{ij})^{3/2}}$,
where $m_{ijk}=\frac{T}{(T-2)(T-1)}\sum_{t=1}^T (x_i(t)-\mean(x_i))(x_j(t)-\mean(x_j))(x_k(t)-\mean(x_k))$ is a third
cross-moment. Assuming independence of species dynamics, this becomes $\frac{\sum_i m_{iii}}{(\sum_i v_{ii})^{3/2}}=
\frac{\sum_i m_3(x_i)}{(\sum_i v_{ii})^{3/2}}=\Sind$.

# Additional interpretations of $\phi_S$ \label{phiSinterp}

In Methods in the main text, we presented interpretations of the meaning of different values of $\phi_S$ under a 
simplifying assumption that $\phicv \approx 1$. We here discuss interpretations when $\phicv$ is not 
close to $1$. The two governing equations are $\CVcomsq=\phicv \CVindsq$ and $\Scom=\phi_S \Sind$, and 
these equations describe two effects. 
The first of these equations/effects means that, when $\phicv>1$, the variance of the distribution $\xtot$ is 
magnified by species relationships; whereas when $\phicv<1$, the variance of $\xtot$ is mitigated by 
species relationships. To interpret the second equation/effect, we consider the separate cases $\Scom>0$ 
and $\Scom<0$. If $\Scom>0$, corresponding to right skew in the $\xtot$ distribution, then $\phi_S>1$ 
means that right skew was accentuated by species relationships, i.e., some of the observed right skew 
in $\xtot$ was due to species relationships. Whereas $0<\phi_S<1$ means right skew in the $\xtot$ 
distribution was mitigated by species relationships, i.e., right skew would have been stronger were it 
not for species relationships. If $\phi_S<0$, it means species relationships are fully responsible for 
the right skew of the $\xtot$ distribution, i.e., were it not for species relationships, the 
distribution $\xtot$ would have been left skewed instead. If $\Scom<0$, corresponding to left skew in 
the $\xtot$ distribution, then $\phi_S>1$ means that left skew was accentuated by species relationships, 
i.e., some of the observed left skew in $\xtot$ was due to species relationships. Whereas $0<\phi_S<1$ 
means left skew in the $\xtot$ distribution was mitigated by species relationships, i.e., left skew 
would have been stronger were it not for species relationships. If $\phi_S<0$, it means species 
relationships are fully responsible for the left skew of the $\xtot$ distribution, i.e., were it 
not for species relationships, the distribution $\xtot$ would have been right skewed instead. 

What does this mean about the influence of species relationships on extreme high and low 
values of the $\xtot$ distribution? 
Table \ref{further_discussion_table} enumerates possibilities. The two effects described above 
sometimes act together and sometimes act in opposition to influence extreme portions of the 
$\xtot$ distribution. The variance effect (described first in the previous paragraph, and linked to 
the equation $\CVcomsq=\phicv \CVindsq$) tends to render more extreme both extreme low and high 
portions of the distribution, when $\phicv>1$; or tends to render them both less extreme,
when $\phicv<1$. The skewness effect (linked to the equation $\Scom=\phi_S \Sind$) tends 
to render one tail more extreme and the other less extreme. For instance, if $\phicv>1$ 
and $\phi_S>1$ (rows 9 and 10 of Table \ref{further_discussion_table}) and if $\Sind$ and $\Scom$
are positive (row 9 of Table \ref{further_discussion_table}), species relationships 
increase the variance of $\xtot$ and render it more right skewed. In the right tail of 
the $\xtot$ distribution, these effects reinforce each other to render high values of
$\xtot$ more extreme than they would be in the absence of species relationships. But in the
left tail of the $\xtot$ distribution, the variance effect tends to render values more extreme
and the skewness effect tends to render them less extreme. The net outcome of these two
influences depends on their relative strength. 

There is a mathematical relationship between $\phi_S$ and $\phicv$. We have 
\begin{align}
\phi_S &= \frac{\Scom}{\Sind} \\
&= \frac{m_3(\xtot)}{\var(\xtot)^{3/2}} \frac{(\sum_i v_{ii})^{3/2}}{\sum_i m_{iii}} \label{eq_int_2} \\
&= \left( \frac{m_3(\xtot)}{\sum_i m_{iii}} \right) \left( \frac{1}{\phicv} \right)^{3/2},
\end{align}
\noindent where (\ref{eq_int_2}) follows from the definitions and results in SI section \ref{stats_details}.
While this may be a useful relationship for future work, it is neither as straightforward as it
may seem, nor does it render superfluous our new $\phi_S$ metric by relating it to the classic
variance ratio. The reason is, the factor $\frac{m_3(\xtot)}{\sum_i m_{iii}}$ does not necessarily
vary independently of $\phicv$ and $\phi_S$, and the biological interpretation of 
$\frac{m_3(\xtot)}{\sum_i m_{iii}}$ is not immediately obvious, at least not to us.
It may be useful, in future work, to calculate $\phicv$, $\phi_S$ and $\frac{m_3(\xtot)}{\sum_i m_{iii}}$
for a large number and variety of community datasets and to explore empirically how $\phicv$ and 
$\phi_S$ are related across ecosystems.

# Surrogate datasets  \label{surrogs} 

We first outline the properties of our surrogate datasets. We then describe the surrogate 
algorithm, starting with bivariate data, for simplicity, and then moving on to describing the algorithm
for multivariate data. 

The surrogate datasets, $x_i^{(m)}(t)$, have the following properties. 
First, for each $m$ and $i$, the sets $\{x_i(t) : t=1,\ldots,T\}$ and 
$\{x_i^{(m)}(t) : t=1,\ldots,T\}$ are identical, as unordered sets.
In particular this implies that the temporal means and variances of the time series
$x_i(t)$ and $x_i^{(m)}(t)$ are identical, for all $i$ and $m$. 
Second, the expected value of the sum $\sum_{i_2>i_1} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$
is within $\pm 10\%$ of the sum $\sum_{i_2>i_1} \cov(x_{i_1}(t),x_{i_2}(t))$;
and for any $i_1$ and $i_2$, the expected value of
$\cor(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$ is as close as possible to
$\cor(x_{i_1}(t),x_{i_2}(t))$ (see below for information on the extent to which
these quantities agreed for our datasets).
In particular, this means that the total community variance, 
$\var(\sum_i x_i^{(m)}(t))= \sum_{i_1, i_2} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$
for a surrogate dataset should be similar to the empirical total community variance,
$\var(\sum_i x_i(t))= \sum_{i_1, i_2} \cov(x_{i_1}(t),x_{i_2}(t))$.
Likewise,
the variance ratios $\frac{\sum_{i_1, i_2} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))}
{\sum_{i} \var(x_i^{(m)}(t))}$ and 
$\frac{\sum_{i_1, i_2} \cov(x_{i_1}(t),x_{i_2}(t))}
{\sum_{i} \var(x_i(t))}$ should be similar.
Finally, for any $i_1$, $i_2$ and $m$, the tail associations of $x_{i_1}^{(m)}(t)$
and $x_{i_2}^{(m)}(t)$ are symmetric, i.e., similar in their upper- and lower-tail association
values. 

Given data $(x(t),y(t))$ for $t=1,\ldots,T$, the bivariate
surrogates algorithm will provide surrogate datasets
$(x^{(m)}(t),y^{(m)}(t))$ for $t=1,\ldots,T$ with the following properties.
First, for each $m$,
the sets $\{x(t) : t=1,\ldots,T\}$ and $\{x^{(m)}(t) : t=1,\ldots,T\}$
are identical, as unordered sets, as are the sets 
$\{y(t) : t=1,\ldots,T\}$ and $\{y^{(m)}(t) : t=1,\ldots,T\}$. Thus,
in particular, $\mean(x(t))=\mean(x^{(m)}(t))$ and 
$\mean(y(t))=\mean(y^{(m)}(t))$ for all $m$, where these means are computed over time. Likewise
$\var(x(t))=\var(x^{(m)}(t))$ and 
$\var(y(t))=\var(y^{(m)}(t))$ for all $m$. Similarly for
higher moments or other quantities that depend only on the time series
univariate marginal distributions.
Second, the Pearson correlations $\cor(x(t),y(t))$
and $\cor(x^{(m)}(t),y^{(m)}(t))$ are approximately equal (differing 
only due to a type of sampling variation), where these correlations are computed over time. 
Finally, the copula 
structure of the $(x^{(m)}(t),y^{(m)}(t))$ for $t=1,\ldots,T$
will be normal. See @Ghosh_copula for a definition of "copula" and background, as well as 
extensive references to additional literature. Copula statistics is a whole subfield of 
statistics which we do not review here because we already reviewed the basics in
our prior work [@Ghosh_copula], and many basic and advanced texts on the subject 
are available [@joe2014_dependence;@nelsen2006_copula;@MaiScherer2017].
Note also that the covariance
$\cov(x^{(m)}(t),y^{(m)}(t))$ will be approximately equal to the covariance
$\cov(x(t),y(t))$ (up to sampling variation), where these covariances are again computed over time, because 
$\cor(x^{(m)}(t),y^{(m)}(t))$ is approximately equal to 
$\cor(x(t),y(t))$ and $\var(x(t))=\var(x^{(m)}(t))$ and 
$\var(y(t))=\var(y^{(m)}(t))$.

To construct surrogates, we begin by defining a stochastic map
$$\varphi:[-1,1]\rightarrow[-1,1]$$ as follows. Given $p \in [-1,1]$,
consider $p$ to be the covariance of a bivariate normal distribution
with standard normal univariate marginals. Generate data 
$(a(t),b(t))$ for $t=1,\ldots,T$ via independent draws 
from this distribution. Permute the ordered set $(x(1),\ldots,x(T))$
so that, for all $k$, the time series position of the $k$th-largest element
in the permuted set is the same as the time series position of the 
$k$th-largest element of the ordered set $(a(1),\ldots,a(T))$. We refer to
such a permutation operation as \emph{aligning the ranks} of the $x$
to match those of the $a$. Aligning the ranks is not affected by possible ties in
the set $\{x(1),\ldots,x(T)\}$, and, with probability 1, there are no ties in 
the set $\{a(1),\ldots,a(T)\}$. Likewise align the ranks of the $y$
to match those of the $b$. Another way to describe this rank alignment 
step for $x$ is to select a permutation $\sigma_x$
of the indices $1,\dots,T$ such that the time series
$(x(\sigma_x(1)),\ldots,x(\sigma_x(T)))$ has 
$\rank(x(\sigma_x(t)))=\rank(a(t))$ for all $t$, where $\rank(x(\sigma_x(t)))$ is
the rank of $x(\sigma_x(t))$ in the set $\{x(1),\ldots,x(T)\}$ and 
$\rank(a(t))$ is the rank of $a(t)$ in the set $\{a(1),\ldots,a(T)\}$.
We consider the smallest element of a set of size $T$ to have rank $1$ and
the largest element to have rank $T$. 
The value of $\rank(a(t))$ is well defined,
with probability 1, because, with probability 1, there are no ties in 
$\{a(1),\ldots,a(T)\}$. 
The value of $\rank(x(\sigma_x(t)))$ is defined by randomly ranking tied
elements, e.g., the ranks of the elements of the set $\{5, 4.2, 4.2\}$
can randomly be considered to be $\{3,1,2\}$ or $\{3,2,1\}$, with no effect
on the rank alignment algorithm.
We likewise select a permutation $\sigma_y$
such that the time series $(y(\sigma_y(1)),\ldots,y(\sigma_y(T)))$ has 
$\rank(y(\sigma_y(t)))=\rank(b(t))$ for all $t$. We then define $\varphi(p)$
to be the Pearson correlation of the permuted time series, i.e.,
$\cor(x(\sigma_x(t)),y(\sigma_y(t)))$. This is a 
stochastic quantity, because it is based on the stochastically generated
time series $(a(t),b(t))$. We also define the expected value map
$\E\varphi(p)$. In practice, this is computed numerically by computing
the stochastic map $\varphi(p)$ many times and taking the mean. Thus
$\E\varphi(p)$ can be determined with error that can be reduced 
arbitrarily through additional stochastic evaluations of $\varphi(p)$.

Having defined $\varphi$, let $c=\cor(x(t),y(t))$ be the Pearson 
correlation of the data, and select $\hat{p} \in [-1,1]$ such that $\E\varphi(\hat{p})=c$ 
to within the desired precision. See below on how to do this, and our experience with
when it is possible, and whether and when $\hat{p}$ is unique. Then Pearson-preserving 
surrogates are constructed for
the dataset $(x(t),y(t))$, $t=1,\ldots,T$, as follows. First, generate data 
$(a(t),b(t))$ for $t=1,\ldots,T$ via independent draws from the bivariate normal
distribution with standard normal marginals and covariance $\hat{p}$.
Then align the ranks of $x(t)$ to match those of $a(t)$, and align the ranks
of $y(t)$ to match those of $b(t)$. These permutations of $x(t)$ and $y(t)$, together, 
form the surrogates.
Numerous surrogate datasets can be generated by repeating this process.
The first two desired properties of the surrogates listed above are 
satisfied, by construction. The final desired property of the surrogates
(normal copula structure) is satisfied because the ranks of the surrogates
are the same as the ranks of $a$ and $b$ [@Ghosh_copula;@nelsen2006_copula]. 

Given data $(x(t),y(t))$ for $t=1,\ldots,T$ and given $c \in [-1,1]$,
we now discuss what we know about whether and when there exists $\hat{p} \in [-1,1]$ such 
that $\E\varphi(\hat{p})=c$, and whether $\hat{p}$ is unique when it 
exists; we simultaneously discuss how we obtained $\hat{p}$ for our data.
Above we used $c=\cor(x(t),y(t))$, but here we consider general $c \in [-1,1]$. 
Given $(x(t),y(t))$, we evaluated $\varphi$ $500$ times for $p$ equal to each
of $18$ values evenly spaced from $-1$ to $1$, and plotted the results (e.g., Fig. 
\ref{fig:varphiexample} shows an example plot). We generated such a plot
whenever we made use of the map $\varphi$ for a dataset $(x(t),y(t))$.
In every case we checked whether the piecewise-linear approximation 
to the expected-value map $\E\varphi$ (depicted on Fig. \ref{fig:varphiexample} 
for this example) was monotonic, finding that in every case it was. We then
used this piecewise-linear approximation as a stand-in for the map
$\E\varphi$ in subsequent computations. Whenever $c$ was between the values
$\E\varphi(-1)$ and $\E\varphi(1)$, depicted on Fig. \ref{fig:varphiexample}
as the extreme y-axis values obtained, the conditions checked above meant we could
find a single unique pre-image of $c$ under the map $\E\varphi$. This was 
an approximate pre-image because we used the piecewise-linear approximation for 
$\E\varphi$, but was likely to be a good approximation because we used $500$
evaluations of $\varphi$ for each value of $p$.
Henceforth we will denote by $\E\varphi^{-1}$ the inverse of the piecewise-linear
approximation of $\E\varphi$; this piecewise-linear approximation was always invertible
thanks to the checks performed above. The extreme value 
$\E\varphi(1)$ can be straightforwardly
computed by sorting both $x$ and $y$ into ascending order and computing the Pearson correlation.
The extreme value $\E\varphi(-1)$ can be straightforwardly
computed by sorting $x$ into ascending order and $y$ into descending order, and
then computing the Pearson correlation.

Given multivariate data, $(x_1(t),\ldots,x_N(t))$ for $t=1,\ldots,T$,
we now move on to describing surrogates for these data, 
$(x_1^{(m)}(t),\ldots,x_N^{(m)}(t))$ for $t=1,\ldots,T$. Multivariate surrogates 
have the following properties, the second of which is slightly weaker than the 
corresponding property for the bivariate case.
First, for each $m$ and $i$, the sets $\{x_i(t) : t=1,\ldots,T\}$ and 
$\{x_i^{(m)}(t) : t=1,\ldots,T\}$ are identical, as unordered sets. 
Second, the expected value of the sum $\sum_{i_2>i_1} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$
is within $\pm 10\%$ of the sum $\sum_{i_2>i_1} \cov(x_{i_1}(t),x_{i_2}(t))$;
and for any $i_1$ and $i_2$, the expected value of
$\cor(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$ is as close as possible to
$\cor(x_{i_1}(t),x_{i_2}(t))$ (below, we will make precise
the degree to which these quantities agreed for our datasets).
Finally, the copula structure of $(x_1^{(m)}(t),\ldots,x_N^{(m)}(t))$
for $t=1,\ldots,T$ will be multivariate normal. Because
$\sum_{i_2>i_1} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$
is within $\pm 10\%$ of $\sum_{i_2>i_1} \cov(x_{i_1}(t),x_{i_2}(t))$,
the total community variance $\var(\sum_i x_i^{(m)}(t))= \sum_{i_1, i_2} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$
for a surrogate dataset should be similar to the total community variance,
$\var(\sum_i x_i(t))= \sum_{i_1, i_2} \cov(x_{i_1}(t),x_{i_2}(t))$, for the real data.
Likewise,
the variance ratios $\frac{\sum_{i_1, i_2} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))}
{\sum_{i} \var(x_i^{(m)}(t))}$ and 
$\frac{\sum_{i_1, i_2} \cov(x_{i_1}(t),x_{i_2}(t))}
{\sum_{i} \var(x_i(t))}$ should be similar.

We begin by defining the map $\varphi_{i_1 i_2}:[-1,1]\rightarrow[-1,1]$
for each pair of taxa $i_1<i_2$, using the data $(x_{i_1}(t),x_{i_2}(t))$ for $t=1,\ldots,T$;
this is done using the same procedure described in the bivariate case.
We then construct $\hat{p}_{i_1 i_2} = \E\varphi_{i_1 i_2}^{-1}(c_{i_1 i_2})$ for 
$c_{i_1 i_2} = \cor(x_{i_1}(t),x_{i_2}(t))$.
For the datasets we considered, this was uniquely defined for all $i_1$ and $i_2$.
We then construct a symmetric, $N \times N$ matrix, $\hat{P}$, with $1$s on the diagonal, based on the 
values $\hat{p}_{i_1 i_2}$. If this were a positive-definite matrix, a surrogate 
dataset $(x_1^{(m)}(t),\ldots,x_N^{(m)}(t))$ for $t=1,\ldots,T$ would 
then be obtained by the following steps. First, generate data 
$(a_1(t),\ldots,a_N(t))$ for $t=1,\ldots,T$ via independent draws from the multivariate normal
distribution with standard normal marginals and covariance matrix $\hat{P}$
(covariance matrices must be positive definite, so this is where positive definiteness is needed).
Then, for each $i$, align the ranks of $x_i(t)$ to match those of $a_i(t)$.
These permutations of the time series $x_i(t)$, together, form the surrogate dataset.
Numerous surrogate datasets can be generated by repeating this process.
Following reasoning similar to the bivariate case, it is straightforward
to see that the desired properties of the surrogate dataset, listed above, are
satisfied. In fact, the Pearson correlations 
$\cor(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$ should be approximately equal
to the values $\cor(x_{i_1}(t),x_{i_2}(t))$ (up to sampling variation) 
instead of merely being close. And
the expected value of the sum $\sum_{i_2>i_1} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$
should therefore also approximately equal the sum $\sum_{i_2>i_1} \cov(x_{i_1}(t),x_{i_2}(t))$,
rather than merely being within $10\%$. However, the matrix $\hat{P}$ was not 
positive definite for our datasets, 
so we proceeded instead as follows. 

We defined an objective function $f(c)$, where $c$ is a vector consisting of 
values between $-1$ and $1$ indexed by
$i_1$ and $i_2$ where $1 \leq i_1 < i_2 \leq N$. For a given $c$, $f$ was
computed by first computing $p_{i_1 i_2}=\E\varphi_{i_1 i_2}^{-1}(c_{i_1 i_2})$
for all $1 \leq i_1 < i_2 \leq N$; then by forming an $N \times N$ symmetric 
matrix $P$, with ones on the diagonal, from the $p_{i_1 i_2}$; then by computing
the minimum eigenvalue of that matrix. Using the Nelder-Mead simplex algorithm, the 
objective function $f$ was maximized subject to the constraints that: 
$c_{i_1 i_2}$ must be in the domain of $\E\varphi_{i_1 i_2}^{-1}$; and
$\sum_{i_2>i_1} c_{i_1 i_2} \sqrt{\var(x_{i_1}(t)) \var(x_{i_2}(t))}$ must be 
within $10\%$ of $\sum_{i_2>i_1} \cov(x_{i_1}(t),x_{i_2}(t))$.
The Nelder-Mead algorithm was not run to completion, but rather was terminated 
after the first value of $c$ was discovered for which $f(c)$ was greater than $0$;
we denote that first value of $c$ by $\hat{c}$. We then defined 
$\hat{p}_{i_1 i_2}=\E\varphi_{i_1 i_2}^{-1}(\hat{c}_{i_1 i_2})$
for all $1 \leq i_1 < i_2 \leq N$ and we formed the $N \times N$ symmetric matrix 
$\hat{P}$ from these values, putting $1$s on the diagonal. This was a 
positive definite matrix by construction. Surrogates were constructed as 
described in the previous paragraph but using this matrix $\hat{P}$.

With two multivariate species cover datasets (Hays, $N=19$ sp., and Konza, $N=22$ sp., both of these species counts 
including the single pseudo-species consisting of "rare" and "intermediate" species combined, see Methods in 
the main text), we checked whether the 
surrogates actually had the properties outlined above. First, using 
each of 10000 surrogates indexed by $m=1, 2, \ldots, 10000$, we computed 
$\sum_{i_2>i_1} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$ 
and compared the resulting distribution of values to the single quantity 
$\sum_{i_2>i_1} \cov(x_{i_1}(t),x_{i_2}(t))$ (Fig. \ref{fig:check_cov_var_vr}A, B). 
The values for the data were very close to the middle of the distributions of surrogate
values, as desired. 
Second, we made similar 
comparisons (Fig. \ref{fig:check_cov_var_vr}C, D) using the quantities 
$\var(\sum_i x_i^{(m)}(t))$ and $\var(\sum_i x_i(t))$. Again, and by mathematical necessity
given the results of Fig. \ref{fig:check_cov_var_vr}A, B,
the values for the data were very close to the middle of the distributions of surrogate
values, as desired. Third, we made similar comparisons using the variance ratio,
i.e., using the quantities  $\var(\sum_i x_i^{(m)}(t))/ \sum_i \var(x_i^{(m)}(t))$
and $\var(\sum_i x_i(t))/ \sum_i \var(x_i(t))$ (Fig. \ref{fig:check_cov_var_vr}E, F). 
Again, and by mathematical necessity
given the results from Fig. \ref{fig:check_cov_var_vr}A, B,
the values for the data were very close to the middle of the distributions of surrogate
values, as desired. 
Finally, to see how 
similar Pearson correlations from the surrogates were to those of the data for each pair of species, 
we made a scatter plot with the Pearson correlations from actual data plotted against the same quantities
for the surrogates, for each pair of distinct species (Fig. \ref{fig:check_pairwisecor}). 
For most pairs of species, the 2.5th and 97.5th percentiles of the surrogate values 
were on opposite sides of the data value.

# An empirical CDF approach \label{ecdf_approach}

The empirical cumulative distribution function (ECDF) associated with a sample $y_j$, $j=1,\ldots,L$,
from a random variable $Y$ is the function $\hat{F}: \mathbb{R} \rightarrow [0,1]$ such that
$\hat{F}(y)$ equals the number of elements of the set $\{y_1,y_2,\ldots,y_L\}$ which are less than
or equal to $y$, divided by $L$. The ECDF approximates the CDF of $Y$ for large $L$, and hence 
$\hat{F}(y)$ approximates the probability $P[Y \leq y]$. We here
use the ECDF of $\xtot$ to further illuminate some of the claims made in the main text about the 
influence of species relationships on the distribution of $\xtot$ and on probabilities that 
$\xtot$ exceeds high disaster thresholds or falls below low ones.
Henceforth letting $\hat{F}(x)$ denote the ECDF of the sample $\xtot(t)$, $t=1,\ldots,T$, we can interpret 
$\hat{F}(x)$ as an approximation of the probability that $\xtot$ will fall below $x$ for a low 
disaster threshold, $x$. If $x$ is instead a high disaster threshold, then $1-\hat{F}(x)$
approximates the probability that $\xtot$ will exceed $x$. 

Denoting, as in the main text, our species abundance data as 
$x_i(t)$ for species $i=1,\ldots,N$ and times $t=1,\ldots,T$, and also again denoting $\xtot(t)=\sum_i x_i(t)$,
we computed the ECDF $\hat{F}(x)$ of $\xtot$, and compared it to ECDFs $\hat{F}^{(m)}(x)$
based on surrogate datasets $x_i^{(m)}(t)$ for $m=1,\ldots,M$. We again used $M=10000$.
The surrogates described in the main text and in SI section \ref{surrogs} were used, but we 
also, in separate computations, used independent surrogates $x_i^{(\text{ind},m)}(t)$
obtained by randomly and independently permuting the species time series, $x_i(t)$. The purpose, here,
was to compare the ECDF of data, $\hat{F}(x)$, to two null hypotheses, corresponding to the two different types
of surrogates used: one based on a null assumption of symmetric tail associations, but unchanged 
overall correlations between species; and the other based on independent species dynamics. 

The ECDF of data was
displayed by plotting the values of $\hat{F}(x)$ against $x$ for $x$ equal to each of the values $\xtot(t)$, $t=1,\ldots,T$.
To compare to the surrogate ECDFs, for each of the possible vertical axis values $y=1/T,2/T,\ldots,(T-1)/T,T/T=1$ 
that our ECDFs could take, 
and for each $m=1,\ldots,M$, the $x$ value was computed at which $\hat{F}^{(m)}(x)$ first
reached the value $y$. For each of $y=1/T,2/T,\ldots,(T-1)/T,1$,
the resulting distribution of $M=10000$ $x$ values was then displayed and compared to the $x$ value
at which $\hat{F}(x)$ first reached $y$. Fig. \ref{hays_ecdf} shows results for Hays
and Fig. \ref{konz_ecdf} shows results for Konza. Panels (A) of these plots show comparisons with
symmetric tail association surrogates, and panels (B) show comparisons with independent surrogates. 

If a red point near the top of one of the panels of Fig. \ref{hays_ecdf} or \ref{konz_ecdf}
is in the right tail of the surrogate distribution, it indicates that $\xtot$ values for data reached higher 
upper extremes than expected from the corresponding surrogate null hypothesis. In other words, asymmetric tail associations
(panels A) or species relationships, generally (panels B) significantly destabilized (made more extreme) the upper tails of 
$\xtot$. If, on the other hand, the red point is instead in the left tail of the surrogate distribution, it indicates that 
the upper extremes of the $\xtot$ distribution for data were lower than expected from the corresponding null hypothesis. 
In other words, asymmetric tail associations (panels A) or species relationships, generally (panels B) significantly 
stabilized (made less extreme) the upper tails of $\xtot$. 

Analogously, if a red point near the bottom 
of one of the panels of Fig. \ref{hays_ecdf} or \ref{konz_ecdf}
is in the left tail of the surrogate distribution, it indicates that $\xtot$ values for data reached smaller 
lower extremes than expected from the corresponding surrogate null hypothesis. In other words, asymmetric tail associations
(panels A) or species relationships, generally (panels B) significantly destabilized (made more extreme) the lower tails of 
$\xtot$. If, on the other hand, the red point is instead in the right tail of the surrogate distribution, it indicates that 
the lower extremes of the $\xtot$ distribution for data were not as low as expected from the corresponding null hypothesis. 
In other words, asymmetric tail associations (panels A) or species relationships, generally (panels B) significantly 
stabilized (made less extreme) the lower tails of $\xtot$. 

In this way Figs \ref{hays_ecdf} and \ref{konz_ecdf}
reveal possible stabilizing tendencies of species relationships, acting separately on the very large or small
values of $\xtot$. Thus Figs \ref{hays_ecdf} and \ref{konz_ecdf} separately reveal whether species relationships mitigate 
probabilities of $\xtot$ exceeding a high disaster threshold or falling below a low disaster threshold. 

Applying these techniques to Hays, we see that the highest values of $\xtot$ are significantly less extreme than 
would be expected given symmetric tail associations between species (top part of Fig. \ref{hays_ecdf}A), and are
significantly and even more substantially less extreme than would be expected given independent species dynamics
(top part of Fig. \ref{hays_ecdf}B). Both correlations and asymmetry of tail associations between species in
Hays apparently mitigate explosions of $\xtot$ to high values in the system, and reduce the probability of $\xtot$ exceeding
high disaster thresholds. Now looking at low values of $\xtot$, the bottom part of Fig. \ref{hays_ecdf}A shows that
low values of $\xtot$ were significantly lower (more extreme) than, or not significantly different from, what
would be expected given symmetric
tail associations between species. However, they were higher (less extreme) than would be expected given independent
species dynamics (bottom part of Fig. \ref{hays_ecdf}B). So species relationships mitigated the severity of 
low extremes of $\xtot$, and this was due to aspects of species relationships other than tail associations.

Applying our techniques to Konza, we see that some of the highest values of $\xtot$ are more extreme than would be expected given
symmetric tail associations between species (top part of Fig. \ref{konz_ecdf}A), and are more extreme than would
be expected given independent species dynamics (top part of Fig. \ref{konz_ecdf}B). Looking at low values of $\xtot$,
the bottom part of Fig. \ref{konz_ecdf}A shows that low values of $\xtot$ were significantly higher (less extreme)
than what would have been expected given symmetric tail associations. Likewise they were higher (less extreme)
than would be expected given independent species dynamics (bottom part of Fig. \ref{konz_ecdf}B).

<!--Larger version of panels A and B from the example fig first cited in the Intro-->
\begin{figure}
\begin{center}
\includegraphics[width=\textwidth]{./Results/pedagog_figs/pedagog_cv2_skw_SM.pdf}
\caption[Larger version of Fig. \ref{MT-fig_pedag_cv2_skw} from the main text, panels A and B.]{Larger version of Fig. \ref{MT-fig_pedag_cv2_skw} from the main text, panels A and B. \label{fig_pedag_cv2_skw_SM}}
\end{center}
\end{figure}

<!--Theory figure, extended-->
\begin{figure}
\begin{center}
\includegraphics[height=.9\textheight]{./Results/TheoryFig.pdf}
\caption[Extended version of Fig. \ref{MT-fig_theory} of the main text]{Extended version of Fig. \ref{MT-fig_theory} 
of the main text. Dashed black lines on panels F, J, N and R are copied from the solid black line on panel B, to facilitate 
comparison. Likewise dashed black lines on panels G, K, O and S are copied from C; black lines on H, L, P and T are 
copied from D; dashed red lines on K, O and S are copied from G; and dashed red lines on L, P and T are copied from 
H. See the caption of that figure for additional details.\label{fig_theory_SM}}
\end{center}
\end{figure}

<!--hays & knz: time-series results plot-->
\begin{figure}
\begin{center}
\textbf{ \hspace{0.5 cm} (A) \hspace{5.5 cm} (B)} \\
\includegraphics[width=6 cm]{./Results/hays_results/skewness_results/total_timeseries.pdf}
\includegraphics[width=6 cm]{./Results/knz_results/skewness_results/total_timeseries_soiltype_t.pdf}\\
\caption[The aggregate quantity, $\xtot$, for (A) Hays and (B) Konza data]{The aggregate quantity, $\xtot$, for (A) Hays and (B) Konza data, 
compared to the value obtained using only "common" species, or only "common" and "intermediate" species. Dynamics were very similar in all three
cases, justifying the claim that "common" species constitute the large majority of the community, and dominate the dynamics. 
See Methods for definitions of common, intermediate, and rare species. Total percent cover values in (B) are often above $100\%$ 
because species percent cover values for a year and a plot were computed as the maximum percent cover over multiple sampling 
occasions. See SI section \ref{Data} for details.\label{fig_spaceavg_timeseries}}
\end{center}
\end{figure}

\begin{figure}
\begin{center}
\includegraphics[width=4 cm]{./Results/pedagog_figs/LTUT_rho_0p8_UTdep.pdf}
\caption[Normalized rank plot with stronger association in the upper tails than in the lower]{A normalized rank 
plot with stronger association in the upper tails than in the lower, opposite to Fig. \ref{MT-fig_pedag_taildep}
of the main text.\label{fig_pedag_taildep_SM}}
\end{center}
\end{figure}

```{r reformatting_figs, echo=F, results="hide", cache=T, cache.extra=list(ts_mat_CP_hays,mtime("getmap.R"),mtime("Reform_suppmat_fig.R"))}
# Runs the external script Reform_suppmat_fig.R; its output is not shown in the
# document (the chunk header sets echo=F and results="hide").
# The chunk is cached, and the cache is additionally keyed (via cache.extra in
# the chunk header) on ts_mat_CP_hays and on the modification times of getmap.R
# and Reform_suppmat_fig.R, so changing any of those triggers a re-run.
# NOTE(review): presumably this script writes the Reform_maps/Map_*.pdf files
# included in the figure that follows -- confirm against Reform_suppmat_fig.R.
# Don't put any seed in this chunk
source("./Reform_suppmat_fig.R") 
```

\begin{figure}
\includegraphics[width=3in]{./Results/hays_results/skewness_results/pp_surrogs_hays_CP/Reform_maps/Map_1_2.pdf} 
\includegraphics[width=3in]{./Results/hays_results/skewness_results/pp_surrogs_hays_CP/Reform_maps/Map_2_8.pdf} \\
\includegraphics[width=3in]{./Results/hays_results/skewness_results/pp_surrogs_hays_CP/Reform_maps/Map_15_18.pdf}
\caption[Approximations of the $\varphi$ map]{Approximation of the $\varphi$ map for 
species 1 and 2 (A, top left), 2 and 8 (B, top right), and 15 and 18 (C, bottom left) 
of the Hays dataset. Dashed lines are $2.5^{th}$, $25^{th}$, $50^{th}$, $75^{th}$, and $97.5^{th}$ percentiles 
of stochastic realizations of the map, and
the red line and points show the expected value. The horizontal-axis values of the red 
points show the values of the
parameter for which the map $\varphi$ was evaluated, with $500$ evaluations used for
each parameter value.}\label{fig:varphiexample}
\end{figure}

\begin{figure}
\includegraphics[width=9 cm]{./Results/hays_results/skewness_results/pp_surrogs_hays_CP/comparison_histplot_PPsurrogs.pdf}
\includegraphics[width=9 cm]{./Results/knz_results/skewness_results/pp_surrogs_knz_t_CP/comparison_histplot_PPsurrogs.pdf}  
\caption[Checks on the surrogates, 1]{(A, B) Distributions of surrogate values 
$\sum_{i_2>i_1} \cov(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$ 
compared to the value $\sum_{i_2>i_1} \cov(x_{i_1}(t),x_{i_2}(t))$ for Hays (A) and Konza (B),
where surrogate datasets were indexed by $m=1,\ldots,10000$. 
(C, D) Distributions of surrogate values $\var(\sum_i x_i^{(m)}(t))$ compared
to the value $\var(\sum_i x_i(t))$ for Hays (C) and Konza (D). 
(E, F) Distributions of surrogate values 
$\var(\sum_i x_i^{(m)}(t))/ \sum_i \var(x_i^{(m)}(t))$ compared
to the value $\var(\sum_i x_i(t))/ \sum_i \var(x_i(t))$ for Hays (E) and Konza (F). 
The fraction, p, of surrogate values less than 
the data value is at the top of each panel.}\label{fig:check_cov_var_vr}
\end{figure}

\begin{figure}
\textbf{ \hspace{4.5 cm} (A) \hspace{8 cm} (B)}\\
\includegraphics[width=9 cm]{./Results/hays_results/skewness_results/pp_surrogs_hays_CP/comparison_pairwisecor_PPsurrogs.pdf}  
\includegraphics[width=9 cm]{./Results/knz_results/skewness_results/pp_surrogs_knz_t_CP/comparison_pairwisecor_PPsurrogs.pdf} 
\caption[Checks on the surrogates, 2]{For each pair of species $i_1<i_2$, the distribution of values 
$\cor(x_{i_1}^{(m)}(t),x_{i_2}^{(m)}(t))$ for $m=1,\ldots,10000$ was plotted against the value
$\cor(x_{i_1}(t),x_{i_2}(t))$ for Hays (A) and Konza (B). Distributions of surrogate 
correlations were rendered by showing their $2.5^{th}$ and $97.5^{th}$ percentiles (extents of
vertical lines) and their median (dots). Bars overlapping with the 1-1 line were rendered in black,
and those not overlapping were rendered in red. }\label{fig:check_pairwisecor}
\end{figure}

<!--Results with corl-coru matrix plot-->
\begin{figure*}
\begin{center}
\textbf{ \hspace{-7 cm} (A) \hspace{7 cm} (B)}\\
\includegraphics[width=8cm]{./Results/hays_results/corstat_hays_spaceavg_results/Corl-Coru_ub_0.5.pdf}
\includegraphics[width=8cm]{./Results/knz_results/corstat_knz_spaceavg_results/soiltype_t/Corl-Coru_ub_0.5.pdf} 
\caption[Measures of asymmetry of tail association for pairs of species]{Measures of asymmetry of 
tail association for all pairs of species with positive Spearman correlation,
using the Hays (A) and Konza (B) datasets. The quantity $\cor_l-\cor_u$ (Methods) is displayed, using a blue/red split colorbar,
for all positively associated pairs of species. Green-shaded species pairs were negatively associated. See
Tables \ref{tab_spinfo_hays}-\ref{tab_spinfo_knz} for species names. Although both lower- and upper-tail
association occur in both datasets, the quantities $n_L$, $n_U$ and $\Atot$ (Methods), displayed under each 
panel, indicate that lower-tail association between species was stronger or more common in the Hays data, and 
upper-tail association was stronger or more common in the Konza data. \label{fig_CorlmCoru}}
\end{center}
\end{figure*}

<!--Skewness results for both datasets-->
\begin{figure}
\begin{center}
\textbf{ \hspace{-5 cm} (A) \hspace{7 cm} (B)}\\
\includegraphics[width=15cm]{./Results/hays_results/skewness_results/hays_pearson_preserving_results_cvsq_skw_plots.pdf}\\
\textbf{ \hspace{-5 cm} (C) \hspace{7 cm} (D)}\\
\includegraphics[width=15cm]{./Results/knz_results/skewness_results/knz_pearson_preserving_results_cvsq_skw_plots.pdf}
\hspace{1 cm}
\includegraphics[width=16cm]{./Results/hays_results/skewness_results/legend_plot.pdf}
\caption[Elaboration of some results of Fig. \ref{MT-fig_result_box}]{Squared coefficients of variation and 
skewness values for $\xtot$ for data, surrogates, and the independent-species null hypothesis, for Hays (A, B) 
and Konza (C, D). Panels A and C show the values of $\CVcomsq$ (black dot) and $\CVindsq$ (dashed vertical line)
for Hays and Konza, as well as the distribution of values of the squared coefficient of variation of $\sum_i x_i^{(m)}$ 
over all surrogate datasets $m=1,\ldots,M$ (Methods). Dotted lines give the $2.5\%$ and $97.5\%$ quantiles
of these distributions, and the triangle symbols give the medians. Panels B and D show the values of 
$\Scom$ (black dot) and $\Sind$ (dashed vertical line) for Hays and Konza, as well as the distribution of
values of $\sk(\sum_i x_i^{(m)})$ over all surrogate datasets $m=1,\ldots,M$ (Methods). Dotted lines again
give quantiles and the triangles give medians. \label{fig_result_box}}
\end{center}
\end{figure}

\begin{figure}
\begin{center}
\includegraphics[width=15cm]{./Results/Hays_ecdfs.pdf}
\caption[Empirical CDF of $\xtot$ compared to surrogates, Hays]{Comparison of the ECDF $\hat{F}(x)$ of $\xtot$ to 
the surrogate-based quantities $\hat{F}^{(m)}(x)$, $m=1,\ldots,10000$, for Hays (see SI section \ref{ecdf_approach}). 
Red points display $\hat{F}(x)$ at the values of $\xtot$. For each of the
quantities $1/T, 2/T,\ldots,(T-1)/T,T/T=1$, black lines show the distribution of $x$ values at which
surrogate ECDFs $\hat{F}^{(m)}(x)$ first reached that $y$ axis value, using the surrogates of SI section \ref{surrogs} (A)
or independent surrogates (B). Minima and maxima (dotted lines), $2.5$th and $97.5$th percentiles (dashed lines) and the median 
(solid line) of the distribution are shown for each $y$ axis value. Numbers at the right show the fraction of surrogate
values less than the empirical value, and are rendered in red whenever they are outside the range $(0.05,0.95)$. 
See SI section \ref{ecdf_approach} for interpretations. \label{hays_ecdf}}
\end{center}
\end{figure}

\begin{figure}
\begin{center}
\includegraphics[width=15cm]{./Results/Konz_ecdfs.pdf}
\caption[Empirical CDF of $\xtot$ compared to surrogates, Konza]{Comparison of the ECDF $\hat{F}(x)$ of $\xtot$ to 
the surrogate-based quantities $\hat{F}^{(m)}(x)$, $m=1,\ldots,10000$, for Konza (see SI section \ref{ecdf_approach}). 
Red points display $\hat{F}(x)$ at the values of $\xtot$. For each of the
quantities $1/T, 2/T,\ldots,(T-1)/T,T/T=1$, black lines show the distribution of $x$ values at which
surrogate ECDFs $\hat{F}^{(m)}(x)$ first reached that $y$ axis value, using the surrogates of SI section \ref{surrogs} (A)
or independent surrogates (B). Minima and maxima (dotted lines), $2.5$th and $97.5$th percentiles (dashed lines) and the median 
(solid line) of the distribution are shown for each $y$ axis value. Numbers at the right show the fraction of surrogate
values less than the empirical value, and are rendered in red whenever they are outside the range $(0.05,0.95)$. 
See SI section \ref{ecdf_approach} for interpretations. \label{konz_ecdf}}
\end{center}
\end{figure}


# References

\indent