library(fabia)

library(permute)

source("/seppdata/sepp/linkage/release/scripts/funct.R")

load(file="/seppdata/sepp/linkage/release/annotation/individuals.Rda")


minruns <- 1       ## min number or runs
maxruns <- 100     ## max number or runs
snps <- 10000      ## number of SNPs
samplesN <- 1000    ## number of individuals
ShunkLength <- 5000 ## length suffeled shunks of original data
noImplanted <- 20 ## how many haplotypes
implanted <- 10 ## in how many samples
lengthI <- 10 ## IBD length in kb


noOverwrite <- FALSE ## noOverwrite=TRUE ensures that a haplotype is not superimposed by another haplotype

write(paste("maxruns: ",maxruns,sep=""),file="dataSimParameters.txt",ncolumns=10)
write(paste("snps: ",snps,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10)
write(paste("samplesN: ",samplesN,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10)
write(paste("ShunkLength: ",ShunkLength,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10)
write(paste("noImplanted: ",noImplanted,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10)
write(paste("implanted: ",implanted,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10)
write(paste("lengthI: ",lengthI,sep=""),file="dataSimParameters.txt",append=TRUE,ncolumns=10)



snpsAll <- 3201157
samplesAll <- 1092
haploAll <- 2184

shift <- 5000
intervallAll <- 10000
over <- intervallAll%/%shift

N1 <- snpsAll%/%shift
endRunAll <- (N1-over+1) # 639
startRunAll <- 0

startRun <- 1

endRun <- 639

nucleotide <- c("A","T","C","G")




haploN <- 2*samplesN
  
dataA <- 1:samplesN

g2 <- 2*(1:snps)
g1 <- g2 -1


s2 <- 2*dataA
s1 <- s2-1


id <- as.character(dataA)
zeroid <-  as.character(rep(0,samplesN))

famID <- id
dim(famID) <- c(samplesN,1)
ID <- id
dim(ID) <- c(samplesN,1)
patID <- zeroid 
dim(patID) <- c(samplesN,1)
matID <- zeroid 
dim(matID) <- c(samplesN,1)
sex <- zeroid 
dim(sex) <- c(samplesN,1)
phen <- zeroid 
dim(phen) <- c(samplesN,1)



snpNames <- as.character(1:snps)

chr <- rep(1,snps)






for (run in minruns:maxruns) {

  write(noImplanted,file=paste("dataSim",run,"Impl.txt",sep=""),ncolumns=10)


  posAll <- sample(startRun:endRun,1)  

  start <- posAll*shift
  end <- start + intervallAll
  
  if (end > snpsAll)  {
    
    end <- snpsAll
  }


  
  pRange <- paste("_",format(start,scientific=FALSE),"_",format(end,scientific=FALSE),sep="")

  write(pRange,file=paste("dataSimN",run,"Impl.txt",sep=""),ncolumns=10)

  annot <- read.table(paste("../shuffle",pRange,"_annot.txt",sep=""),header = FALSE, sep = " ", quote = "",as.is = TRUE,skip=2)



  for (i in 3:9) {

    annot[[i]] <- gsub(",",";",annot[[i]])

  }

  for (i in 4:5) {
  
    annot[[i]] <- gsub("<","A",annot[[i]])
    annot[[i]] <- gsub(">","A",annot[[i]])
  }

  pos <- annot[,2]
  ref <- substr(annot[,4],1,1)
  alt <- substr(annot[,5],1,1)

  Dpos <- c(sample(50:150,1),diff(pos))


 Zannot <- read.table(paste("/seppdata/sepp/linkage/release/split/ALL.chr1.merged_beagle_mach.20101123.snps_indels_svs.genotypes",pRange,"_annot.txt",sep=""),header = FALSE, sep = " ", quote = "",as.is = TRUE,skip=2)



  for (i in 3:9) {

    Zannot[[i]] <- gsub(",",";",Zannot[[i]])

  }

  for (i in 4:5) {
  
    Zannot[[i]] <- gsub("<","A",Zannot[[i]])
    Zannot[[i]] <- gsub(">","A",Zannot[[i]])
  }

  Zpos <- Zannot[,2]
  Zref <- substr(Zannot[,4],1,1)
  Zalt <- substr(Zannot[,5],1,1)

  ZDpos <- c(sample(50:150,1),diff(Zpos))


 
  alleleI <- matrix(0,haploN,snps)
  alleleN <- matrix("N",haploN,snps)
  for (i in 1:haploN) {
    alleleN[i,] <- ref
  }

  file <- paste("../shuffle",pRange,"_mat.txt",sep="")

 file1 <- paste("/seppdata/sepp/linkage/release/split/ALL.chr1.merged_beagle_mach.20101123.snps_indels_svs.genotypes",pRange,"_mat.txt",sep="")
 
  minors <- as.vector(rep(0,haploN))


  selectI <- sample(samplesAll,samplesN)
  selectI <- sort(selectI)
  readin <- as.vector(unlist(rbind(2*selectI-1,2*selectI)))
  write(selectI,file=paste("dataSimN",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000)
  write(readin,file=paste("dataSimN",run,"Impl.txt",sep=""),append=TRUE,ncolumns=2000)

  for (i in 1:haploN) {
    i1 <- readin[i]
    minors[i] <- as.integer(scan(file,what="integer",nlines=1,skip=3*(i1-1)+2))
    tmp <- as.integer(scan(file,what="integer",nlines=1,skip=3*(i1-1)+3))
    alleleI[i,tmp] <- 1
    alleleN[i,tmp] <- alt[tmp]
  }

  Ipos <- diffinv(Dpos)[2:(snps+1)]


  implantIL <- list()
  implantNL <- list()
  startN <- list()
  endN <- list()
  fromN <- list()
  lengthN <- list()
  
  for (noIm in 1:noImplanted) {

  notfoundIBD <- TRUE
  while (notfoundIBD) {

  startN[[noIm]] <- sample((snps-500),1)

  DZpos <- Zpos-Zpos[startN[[noIm]]]
  endN[[noIm]] <- max(which(DZpos<lengthI*1000))
  
  fromN[[noIm]] <- sample(haploN,1)

  inter <- startN[[noIm]]:endN[[noIm]]

  lN <- length(inter)

  vv <- rep(0,snps)
  tmp <- as.integer(scan(file1,what="integer",nlines=1,skip=3*(fromN[[noIm]]-1)+3))
  vv[tmp] <- 1
  
  Timp <- vv[inter]

  Timp[1] <- 1
  Timp[lN] <- 1
  
  lengthN[[noIm]] <- lN
  implantIL[[noIm]] <- Timp

  a1 <- which(implantIL[[noIm]]>0)
  if (length(a1)>10) {
    notfoundIBD <- FALSE
  }
  }
  
  }



  physPos <- diffinv(Dpos)[2:(snps+1)]
  snpMajor <- ref
  snpMinor <- alt
  DphysPos <- Dpos
  
  
  freq <- colSums(alleleI)/haploN
  pos <- physPos
  pos1 <- pos/1000000


  plinkCols <- cbind(famID,ID,patID,matID,sex,phen)

  plinkmap <- cbind(chr,snpNames,pos1,pos)

  plinkmap[,1] <- as.integer(plinkmap[,1])
  plinkmap[,2] <- as.integer(plinkmap[,2])
  plinkmap[,3] <- format(as.double(plinkmap[,3]),nsmall=6)
  plinkmap[,4] <- as.integer(plinkmap[,4])

  plinkLine <- c("FID","IID","PAT","MAT","SEX","PHENOTYPE",snpNames)


  mcmc <- rbind(pos1,freq)


  
  initM <- matrix(0,samplesN,2*snps)


 freq <- colSums(alleleI)/haploN

  
  alleleIimp <- alleleI
  alleleNimp <- alleleN

  allinter <- list()
  allimpl <- list()

 for (noIm in 1:noImplanted) {

  notfoundIBD <- TRUE
  while (notfoundIBD) {

  notfoundLong <- TRUE
  while (notfoundLong) {
   
  start <- sample((snps-lengthN[[noIm]]-1),1)
  
  end <- start+lengthN[[noIm]]-1

  if (pos[end]-pos[start]>1.5*ShunkLength) {
  notfoundLong <- FALSE
  }
  
  }
  inter <- start:end
 
  implantI <- implantIL[[noIm]]
        
  allinter[[noIm]] <- inter

  implantN <- ref[inter]
  a1 <- which(implantI>0)
  intt <- inter[a1]
  implantN[a1] <- alt[intt]

  implantNL[[noIm]] <- implantN

  impl <- sample(haploN,implanted)

  allimpl[[noIm]] <- impl

  notfoundIBD <- FALSE

  a2 <- freq[intt]
  a3 <- which(a2<0.05)
  if (length(a3)<6) {
    notfoundIBD <- TRUE
  }
  
  if ((noIm>1)&&(noOverwrite)) {
    for (no2Im in 1:(noIm-1)) {
      if ((length(intersect(impl,allimpl[[no2Im]]))>0)&&(length(intersect(inter,allinter[[no2Im]]))>0)) {
        notfoundIBD <- TRUE
      }
    }
  }


  }

  
  impl=sort(impl)

  implG <- (impl+1)%/%2
  write(impl,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=100)
  write(implG,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=100)
  write(inter,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000)
  write(implantI,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000)
  write(implantN,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=1000)

  intervalImp <- matrix(0,implanted,4)
  
  
  for (i in 1:implanted) {


    alleleIimp[impl[i],inter] <- implantI
    alleleNimp[impl[i],inter] <- implantN

    interImp <- c(start,end,1,lengthN[[noIm]])
    intervalImp[i,] <-  interImp
    
  }
  write(interImp,file=paste("dataSim",run,"Impl.txt",sep=""),append=TRUE,ncolumns=100)



}
                            

alleleIimpGeno <- alleleIimp[s1,] + alleleIimp[s2,]

dummyL <- 1:snps
dummyC <- rep(0,snps)
dummy <- as.character(dummyL)

annot <- list()

annot[[1]] <- dummyL
annot[[2]] <- pos
annot[[3]] <- snpNames
annot[[4]] <- snpMajor
annot[[5]] <- snpMinor
annot[[6]] <- dummy
annot[[7]] <- dummy
annot[[8]] <- dummy
annot[[9]] <- dummy
annot[[10]] <- freq
annot[[11]] <- dummyC


 save(snps,haploN,samplesN,dataA,pos,pos1,snpNames,snpMajor,snpMinor,freq,annot,file=paste("dataSim",run,"Anno.Rda",sep=""))
  
 save(impl,implG,inter,implantI,implantN,intervalImp,file=paste("dataSim",run,"Impl.Rda",sep=""))
  
  # BEAGLE
  ########
  
  v1 <- rep(1,haploN)
  dim(v1) <- c(1,haploN)

  v0 <- as.numeric(rbind(dataA,dataA))
  dim(v0) <- c(1,haploN)



  dataB1 <- rbind(v0,v1,t(alleleNimp))

  col1 <- c("id","BC58",snpNames)

  dim(col1) <- c((snps+2),1)

  col2 <- rep("M",snps)

  col2 <- c("I","A",col2)

  dim(col2) <- c((snps+2),1)


  dataB <- cbind(col2,col1,dataB1)

  write.table(dataB,file=paste("dataSim",run,"beagle.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE)




  # PLINK
  #######

  dataP1 <- matrix("character",nrow=samplesN,ncol=2*snps)
  

  dataP1[,g1] <- alleleNimp[s1,]
  dataP1[,g2] <- alleleNimp[s2,]

  
  dataP <- cbind(plinkCols,dataP1)
  
  write.table(dataP,file=paste("dataSim",run,"plink.ped",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE)



  write.table(plinkmap,file=paste("dataSim",run,"plink.map",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t")


  write.table(plinkCols,file=paste("dataSim",run,"plink.fam",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t")





  
   #MCMC
   #####

  write.table(alleleIimpGeno,file=paste("dataSim",run,"mcmc.genotype",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t")
  write.table(mcmc,file=paste("dataSim",run,"mcmc.posmaf",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t")
  write.table(initM,file=paste("dataSim",run,"mcmc.initz",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep="\t")


   #RELATE
   ########


  write.table((alleleIimpGeno+1),file=paste("dataSim",run,"relate.geno",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE)

  write.table(pos1,file=paste("dataSim",run,"relate.pos",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE)
  write.table(chr,file=paste("dataSim",run,"relate.chr",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE)



  # fastPHASE
  #######

  write(as.integer(samplesN),file=paste("dataSim",run,"fastPHASE.txt",sep=""),ncolumns=10)
  write(as.integer(snps),file=paste("dataSim",run,"fastPHASE.txt",sep=""),append=TRUE,ncolumns=10)


  dataFP <- matrix("",nrow=3*samplesN,ncol=snps)

  f3 <- 3*dataA
  f2 <- f3-1
  f1 <- f3-2

  dataInd <- matrix("",nrow=samplesN,ncol=snps)
  dataInd[,1] <- paste("# id ",dataA)
                  
                 
                 
  temp1 <- as.matrix(alleleIimpGeno)
  temp1[which(temp1>1,arr.ind=TRUE)] <- 1
  temp2 <- as.matrix(alleleIimpGeno)
  temp2 <- temp2-1
  temp2[which(temp2<0,arr.ind=TRUE)] <- 0
  

  dataFP[f1,] <- dataInd
  dataFP[f2,] <- temp2
  dataFP[f3,] <- temp1
  
  dataFP[which(dataFP=="0",arr.ind=TRUE)] <- "0 "
  dataFP[which(dataFP=="1",arr.ind=TRUE)] <- "1 "

                 
  write.table(dataFP,file=paste("dataSim",run,"fastPHASE.txt",sep=""),quote=FALSE,row.names=FALSE,col.names=FALSE,append=TRUE,sep="")


  #FABIA
  ######



  write(as.integer(haploN),file=paste("dataSim",run,"fabia.txt",sep=""),ncolumns=10)
  write(as.integer(snps),file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10)

  for (i in 1:haploN) {
    
    a1 <- which(alleleIimp[i,]>0.01)
    
    al <- length(a1)
    b1 <- alleleIimp[i,a1]

    a1 <- a1 - 1
    dim(a1) <- c(1,al)
    dim(b1) <- c(1,al)

    write(al,file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10)
    write(a1,file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10000)
    write(format(as.double(b1),nsmall=1),file=paste("dataSim",run,"fabia.txt",sep=""),append=TRUE,ncolumns=10000)
    
  }





}
                            



