import java.util.Random; import java.io.*; class Simulate { /* * Given the coverage and the real proportions, sample the individuals that would be selected. * returns an array of individual indexes with length coverage based on the actual proportion. */ private static int[] sample_proportion(double[] realproportion, short coverage, Random rand) { double total = 0; for(int i=0; i < realproportion.length; i++) { total += realproportion[i]; } double[] normalized_probabilities = new double[realproportion.length]; for(int i=0; i < realproportion.length; i++) { normalized_probabilities[i] = realproportion[i]/total; } double[] cutoff = new double[realproportion.length]; double cumulative = 0; for(int i=0; i < realproportion.length; i++) { cumulative += normalized_probabilities[i]; cutoff[i] = cumulative; } int[] return_indexes = new int[coverage]; for(short i=0; i < coverage; i++) { double randomDouble = rand.nextDouble(); for(int j=0; j < normalized_probabilities.length; j++) { if (randomDouble < cutoff[j]) { return_indexes[i] = j; break; } } } return return_indexes; } public static void main(String args[]) throws IOException { if (args.length < 3) { System.err.println("java Simulate "); System.err.println("java -jar Simulate.jar "); System.exit(1); } Random rand = new Random(1); int N = Integer.parseInt(args[0]); //No. of individuals int no_of_snps = Integer.parseInt(args[1]); short coverage = (short) Integer.parseInt(args[2]); //Sequencing coverage String outputfilename = args[3]; //outputfile PrintWriter writer = new PrintWriter(outputfilename, "UTF-8"); double[] realproportion = new double[N]; double[] workingproportion = new double[N]; int total = 0; int[] realproportion_count = new int[N]; for(int i = 0; i < N; i++) { int no_of_copies = rand.nextInt(10000); total += no_of_copies; realproportion_count[i] = no_of_copies; } for(int i = 0; i < N; i++) { workingproportion[i] = 1.0/N; realproportion[i] = ((double)realproportion_count[i])/total; System.err.println(i + ": True relative proportion: " + realproportion[i]); if (i==0) { writer.print(realproportion[i]); } else { writer.print("\t" + realproportion[i]); } } writer.println(); //System.err.println("Simulating " + N + " of SNPs..."); byte[][] genotypes = new byte[no_of_snps][N]; short[][] reads = new short[no_of_snps][2]; for(int i=0; i < no_of_snps; i++) { double minor_allele_frequency = (double) (5 + rand.nextInt(45))/100; //System.err.println(i + ": SNP, MAF is " + minor_allele_frequency); for(int j=0; j < N; j++) { boolean a1; boolean a2; a1 = (rand.nextDouble() > minor_allele_frequency); a2 = (rand.nextDouble() > minor_allele_frequency); if (!a1 && !a2) { genotypes[i][j] = 0; } else if (!a1 && a2) { genotypes[i][j] = 1; } else if (a1 && !a2) { genotypes[i][j] = 1; } else if (a1 && a2) { genotypes[i][j] = 2; } } short A_count = 0; short B_count = 0; int[] sampled_indexes = sample_proportion(realproportion, coverage, rand); for (int j=0; j < sampled_indexes.length; j++) { byte this_genotype = genotypes[i][sampled_indexes[j]]; if (this_genotype == 0) { //definitely A A_count++; } else if (this_genotype == 2) { //definitely B B_count++; } else { //maybe A or B since it is diploid double coinflip = rand.nextDouble(); if (coinflip < 0.5) { A_count++; } else { B_count++; } } } reads[i][0] = A_count; reads[i][1] = B_count; } for(int i=0; i < no_of_snps; i++) { for(int j=0; j < N; j++) { //System.err.print(genotypes[i][j]); } //System.err.println(); } //Perform EM algorithm int no_of_iterations = 2000; double[] A_likelihood; double[] B_likelihood; for(int counter=0; counter < no_of_iterations; counter++) { A_likelihood = new double[N]; B_likelihood = new double[N]; for(int i=0; i < N; i++) { A_likelihood[i] = 0; B_likelihood[i] = 0; } for(int snp_index = 0; snp_index < no_of_snps; snp_index++) { short A_count = reads[snp_index][0]; short B_count = reads[snp_index][1]; double A_total = 0; double B_total = 0; for(int i=0; i < N; i++) { short this_genotype = genotypes[snp_index][i]; if (this_genotype == 0) { A_total += workingproportion[i]; } else if (this_genotype == 1) { A_total += 0.5 * workingproportion[i]; B_total += 0.5 * workingproportion[i]; } else if (this_genotype == 2) { B_total += workingproportion[i]; } } for(int i=0; i < N; i++) { short this_genotype = genotypes[snp_index][i]; if (this_genotype == 0) { if (A_total > 0 && A_count > 0) { A_likelihood[i] += (workingproportion[i]/A_total) * A_count; } } else if (this_genotype == 1) { if (A_total > 0 && A_count > 0) { A_likelihood[i] += ((0.5 * workingproportion[i])/A_total) * A_count; } if (B_total > 0 && B_count > 0) { B_likelihood[i] += ((0.5 * workingproportion[i])/B_total) * B_count; } } else if (this_genotype == 2) { if (B_total > 0 && B_count > 0) { B_likelihood[i] += (workingproportion[i]/B_total) * B_count; } } } } //Determine next working_proportion based on likelihoods double total_to_divide_by = 0; for (int i=0; i < N; i++) { total_to_divide_by += A_likelihood[i]; total_to_divide_by += B_likelihood[i]; } System.err.println("Counter: " + counter); for (int i=0; i < N; i++) { workingproportion[i] = (A_likelihood[i] + B_likelihood[i])/total_to_divide_by; System.err.println(i + ":Real_Proportion is " + realproportion[i] + " Working Proportion is " + workingproportion[i] + "\n"); if (i == 0) { writer.print(workingproportion[i]); } else { writer.print("\t" + workingproportion[i]); } } writer.println(); } writer.close(); } }