# This program is simply designed to take the wordlist found at:
# http://www.artsci.wustl.edu/~bkessler/SensPhonotactic/dictAlign
# and convert its transcription scheme to the one used in CELEX,
# for easier comparison of statistics in the two corpora
{
    # Main program: read the CELEX frequency table, then convert every entry
    # of the word list to CELEX-style transcription, print one row per word,
    # and finally print a count of each CV syllable template.
    #
    # Strictures are scoped to this block so the main program is checked
    # without changing how the rest of the file is compiled.
    use strict;
    use warnings;

    my $input_file = "dictAlign.txt";
    my $celex_file = "CelexWordFreqs.txt";

    # Symbols that count as a vowel once remove_digraphs() has collapsed the
    # multi-character symbols.  The non-ASCII placeholders are listed as whole
    # alternatives instead of inside the bracket class: in a bracket class a
    # multi-byte placeholder would be matched one byte at a time and would
    # turn into several V's in the template.
    my $vowels = qr/(?:\@r|[aeiouAEIOU&V\@3YWoBML]|Ì|¯|¿|®|Ž|š|•|‘|Ÿ)/;

    # Ordered rewrite rules from the word list's scheme to CELEX's.  Order
    # matters: a symbol produced by one rule must not be consumed by a later
    # one (e.g. T -> tS has to run before q -> T, and O -> A before c -> O:).
    my @rewrite_rules = (
        [ 'T', 'tS' ],
        [ 'D', 'dZ' ],
        [ 'q', 'T'  ],
        [ 'Q', 'D'  ],
        [ 'G', 'N'  ],    # engma
        [ 'H', 'w'  ],    # wh (merge with w)

        [ 'i', 'i:' ],
        [ 'y', 'I'  ],
        [ 'e', 'eI' ],
        # E is the same in both
        [ 'a', '&'  ],    # low front ae

        # Doesn't make sense to replace A with A:, lengthening to create
        #   the British distinction.  We can merge the "spa" and "cot" vowels,
        #   at least, though.
        [ 'O', 'A'  ],

        [ 'o', '@U' ],
        [ 'c', 'O:' ],
        [ 'u', 'u:' ],
        # U is the same in both

        [ 'Y', 'aI' ],
        [ 'C', 'OI' ],
        [ 'W', 'aU' ],
        [ 'R', '@r' ],
    );

    # First, get the CELEX frequencies (so we can rule out some of the really
    # weird stuff from Kessler & Treiman's list)
    my %celex_freq;
    open(my $celex_fh, '<', $celex_file)
        or die "Warning! Can't open CELEX file: $!\n";
    while (my $line = <$celex_fh>) {
        chomp $line;
        my ($freq, $orthography) = split /\t/, $line;
        next unless defined $orthography;    # not a "freq<TAB>word" line
        $celex_freq{$orthography} = $freq;
    }
    close $celex_fh;

    open(my $in_fh, '<', $input_file)
        or die "Warning! Can't open input file: $!\n";

    print "Freq\tCELEX Freq\tWord\tTranscription\tSyl template\n";

    my %template_count;
    while (my $line = <$in_fh>) {
        chomp $line;
        next if $line !~ /\S/;               # skip blank lines

        # Only the first three tab-separated columns are used.
        my ($transcription, $orthography, $freq) = split /\t/, $line;
        $orthography = '' unless defined $orthography;
        $freq        = '' unless defined $freq;

        for my $rule (@rewrite_rules) {
            my ($from, $to) = @$rule;
            $transcription =~ s/\Q$from\E/$to/g;
        }

        # Build the CV template: collapse multi-character symbols, mark every
        # vowel symbol as V, and everything that is left as C.
        my $template = remove_digraphs($transcription);
        $template =~ s/$vowels/V/g;
        $template =~ s/[^V]/C/g;

        # Words missing from the CELEX table get an empty frequency column.
        my $celex = defined $celex_freq{$orthography}
                  ? $celex_freq{$orthography}
                  : '';

        print "$freq\t$celex\t$orthography\t[$transcription]\t$template\n";
        $template_count{$template}++;
    }
    close $in_fh;

    # Template summary, most frequent first (ties in alphabetical order) so
    # the output is the same from run to run.
    print "\n";
    for my $template (
        sort { $template_count{$b} <=> $template_count{$a} or $a cmp $b }
        keys %template_count
    ) {
        print "$template\t$template_count{$template}\n";
    }
}


# remove_digraphs($transcription)
#
# Returns a copy of $transcription in which each multi-character symbol
# handled below is rewritten as one placeholder symbol, so that every sound
# is written with a single symbol.  The argument itself is not modified.
# The rules are applied in the order listed.
sub remove_digraphs {
    my ($string) = @_;

    # Some digraphs indicate length redundantly on tense vowels; removing
    # the colon won't result in any neutralizations
    $string =~ s/i:/i/g;
    $string =~ s/A:/A/g;
    $string =~ s/u:/u/g;
    $string =~ s/3:/3/g;
    $string =~ s/A~:/Ì/g;
    $string =~ s/O~:/¯/g;

    # Some tense vowels have lax correspondents with the same symbol;
    # have to change
    $string =~ s/O:/¿/g;

    # Nasalized short and long ae (ash) doesn't even seem like a real
    # distinction; I'm going to neutralize them.  The colon is optional so
    # that both the short (&~) and the long (&~:) form are rewritten.
    $string =~ s/&~:?/®/g;

    # Diphthongs
    $string =~ s/eI/Ž/g;
    $string =~ s/aI/Y/g;
    $string =~ s/OI/š/g;
    $string =~ s/aU/W/g;
    $string =~ s/\@U/o/g;

    # The following usually correspond to r in American English
    $string =~ s/I\@/•/g;
    $string =~ s/E\@/‘/g;
    $string =~ s/U\@/Ÿ/g;

    # Also some consonant digraphs
    $string =~ s/dZ/J/g;
    $string =~ s/tS/C/g;
    $string =~ s/n,/B/g;    # totally arbitrary; N is taken
    $string =~ s/m,/M/g;
    $string =~ s/l,/L/g;
    $string =~ s/r\*/R/g;

    return $string;
}
# replace_digraphs($string)
#
# Reverse mapping of the single-symbol notation: returns a copy of $string in
# which each placeholder symbol handled below is expanded back into its
# multi-character form.  The argument itself is not modified.
#
# The rules are applied in the order listed.
sub replace_digraphs {
    my ($string) = @_;

    # Tense vowels get their (redundant) length mark back
    $string =~ s/i/i:/g;
    $string =~ s/A/A:/g;
    $string =~ s/u/u:/g;
    $string =~ s/3/3:/g;
    $string =~ s/Ì/A~:/g;
    $string =~ s/¯/O~:/g;

    # Tense vowel whose lax correspondent uses the same base symbol
    $string =~ s/¿/O:/g;

    # Nasalized ae (ash): short and long share one placeholder, so the
    # short form is what comes back
    $string =~ s/®/&~/g;

    # Diphthongs
    $string =~ s/Ž/eI/g;
    $string =~ s/Y/aI/g;
    $string =~ s/š/OI/g;
    $string =~ s/W/aU/g;
    $string =~ s/o/\@U/g;

    # Diphthongs ending in schwa
    $string =~ s/•/I\@/g;
    $string =~ s/‘/E\@/g;
    $string =~ s/Ÿ/U\@/g;

    # Also some consonant digraphs
    $string =~ s/J/dZ/g;
    $string =~ s/C/tS/g;
    $string =~ s/B/n,/g;    # totally arbitrary; N is taken
    $string =~ s/M/m,/g;
    $string =~ s/L/l,/g;
    $string =~ s/R/r\*/g;

    return $string;
}