# Program Name: piS # it so that it calculates pi (the mean pairwise difference) and # S, the number of segregating sites. # # The format for a *.seq file is like this: # sequence_name1 SEQUENCE1 # sequence_name2 SEQUENCE2 # for example, # Comments are delimited by sharp signs, as in the file you are now # reading. # Line 1: Contains number of samples, length of each sequence # Remaining lines have two fields: # Field 1: name of sequence # Field 2: the sequence # Example: # hiv1 GACTGACTGATCG # hiv2 GCATGCTAGCTAG # # Caveats: Sequences need - to be aligned # - to be the same length # - to be on one line (lines can run over the edge, like this one). # BEGIN { ngenes = 0; } NF==0 { # skip empty lines next } NR==1 { # read the number of genes and of sites G = $1; # number of genes nsites = $2; # number of sites } NR>1 { if(NF != 2){ printf("ERROR: line %d has %d fields (should have 2)\n", NR, NF); printf("bad line: \"%s\"\n", $0); exit; } # printf("%d: [%s] [%s]\n", NR, $1, $2); # put sequences into an array ++ngenes; sequence[ngenes]=$2; } END{ # Did we read the right number of sequences? if(ngenes != G){ printf("ERROR read %d sequences; expected %d\n", ngenes, G); exit; } # Check for uneven sequence length for (i=1; i<=G; i++){ if (length(sequence[i]) != nsites){ printf("ERROR - length(%d)=%d; nsites=%d\n", i, length(sequence[i]), nsites); exit; } } # Using the first sequence as a reference, see if any sites # in each other sequence are different. If a site is different, # mark it in the varsites array. In the varsites array, any # cell containing a value > 0 is variable. split(sequence[1], refseq, ""); for (i=2; i<=G; i++){ split(sequence[i], compseq, ""); for (j=1; j<=nsites; j++){ if (refseq[j] != compseq[j]) varsites[j] = 1; } } # Count segregating sites. for (i=1; i<=nsites; i++){ S += varsites[i]; } # Count differences between each pair of sequences printf("Doing %d pairwise comparisons...be patient\n", G*(G-1)/2); for(i=1; i<=G; i++){ printf("%2d:\r", i); split(sequence[i], seqi, ""); for(j=1; j