# This program is simply designed to take the wordlist found at:
#     http://www.artsci.wustl.edu/~bkessler/SensPhonotactic/dictAlign
# and convert its transcription scheme to the one used in CELEX,
# for easier comparison of statistics in the two corpora.
#
# Reads:  dictAlign.txt       (tab-separated; transcription, orthography,
#                              frequency, then further columns that are ignored)
#         CelexWordFreqs.txt  (tab-separated; frequency, orthography)
# Writes: one tab-separated row per word to STDOUT, followed by a blank line
#         and a table of syllable-template counts.

use strict;
use warnings;
use utf8;    # the one-character vowel placeholders below are non-ASCII literals

my $INPUT_FILE = "dictAlign.txt";
my $CELEX_FILE = "CelexWordFreqs.txt";

# Everything that counts as a syllable nucleus once remove_digraphs() has
# collapsed multi-character symbols: the "@r" sequence, plain vowel letters,
# and the single-character placeholders produced by remove_digraphs()
# (including B, M and L, the stand-ins for "n,", "m," and "l,").
# NOTE(review): the non-ASCII placeholder glyphs look like characters from a
# legacy single-byte encoding that were re-decoded at some point; they are kept
# exactly as they appear in the substitutions because they are arbitrary
# stand-ins -- confirm before changing them.
my $VOWELS = qr/\@r|[aeiouAEIOU&V\@3Ì¯¿®ŽYšWo•‘ŸBML]/;

# Convert one transcription from the Kessler & Treiman scheme to CELEX.
#   $transcription - transcription string in the source scheme
# Returns the converted string.  The substitutions run in sequence, and the
# order matters: a symbol must be rewritten before a later rule introduces the
# same letter with a new meaning (e.g. T -> tS before q -> T, a -> & before
# Y -> aI, O -> A before c -> O:).
sub convert_to_celex {
    my ($transcription) = @_;

    $transcription =~ s/T/tS/g;
    $transcription =~ s/D/dZ/g;
    $transcription =~ s/q/T/g;
    $transcription =~ s/Q/D/g;
    $transcription =~ s/G/N/g;      # engma
    $transcription =~ s/H/w/g;      # wh (merge with w)
    $transcription =~ s/i/i:/g;
    $transcription =~ s/y/I/g;
    $transcription =~ s/e/eI/g;
    # E is the same in both
    $transcription =~ s/a/&/g;      # low front ae
    # Doesn't make sense to replace A with A:, lengthening to create
    # the British distinction.  We can merge the "spa" and "cot" vowels,
    # at least, though.
    $transcription =~ s/O/A/g;
    $transcription =~ s/o/\@U/g;
    $transcription =~ s/c/O:/g;
    $transcription =~ s/u/u:/g;
    # U is the same in both
    $transcription =~ s/Y/aI/g;
    $transcription =~ s/C/OI/g;
    $transcription =~ s/W/aU/g;
    $transcription =~ s/R/\@r/g;

    return $transcription;
}

# Build the consonant/vowel skeleton of a CELEX transcription.
#   $transcription - CELEX-style transcription
# Returns a string with one "V" per vowel symbol and one "C" per remaining
# character, computed on the digraph-free form so that each multi-character
# symbol counts once.
sub syllable_template {
    my ($transcription) = @_;

    my $template = remove_digraphs($transcription);
    $template =~ s/$VOWELS/V/g;
    $template =~ s/[^V]/C/g;

    return $template;
}

# Collapse every multi-character CELEX symbol into a single character.
#   $string - CELEX-style transcription
# Returns a new string; the caller's value is not modified.
sub remove_digraphs {
    my ($string) = @_;

    # Some digraphs indicate length redundantly on tense vowels; removing
    # the colon won't result in any neutralizations
    $string =~ s/i:/i/g;
    $string =~ s/A:/A/g;
    $string =~ s/u:/u/g;
    $string =~ s/3:/3/g;
    $string =~ s/A~:/Ì/g;
    $string =~ s/O~:/¯/g;
    # Some tense vowels have lax correspondents with the same symbol;
    # have to change
    $string =~ s/O:/¿/g;
    # Nasalized short and long ae (&~ and &~:) doesn't even seem like a real
    # distinction; neutralize them.  The colon is optional so that the short
    # form is collapsed too.
    $string =~ s/&~:?/®/g;
    $string =~ s/eI/Ž/g;
    $string =~ s/aI/Y/g;
    $string =~ s/OI/š/g;
    $string =~ s/aU/W/g;
    $string =~ s/\@U/o/g;
    # The following usually correspond to r in American English
    $string =~ s/I\@/•/g;
    $string =~ s/E\@/‘/g;
    $string =~ s/U\@/Ÿ/g;
    # Also some consonant digraphs
    $string =~ s/dZ/J/g;
    $string =~ s/tS/C/g;
    $string =~ s/n,/B/g;    # totally arbitrary; N is taken
    $string =~ s/m,/M/g;
    $string =~ s/l,/L/g;
    $string =~ s/r\*/R/g;

    return $string;
}

# Expand the single-character symbols used by remove_digraphs() back into
# their multi-character CELEX spellings.
#   $string - digraph-free transcription
# Returns a new string.  The neutralized nasal ae always comes back as the
# short form "&~".
sub replace_digraphs {
    my ($string) = @_;

    # Restore the redundant length mark on tense vowels
    $string =~ s/i/i:/g;
    $string =~ s/A/A:/g;
    $string =~ s/u/u:/g;
    $string =~ s/3/3:/g;
    $string =~ s/Ì/A~:/g;
    $string =~ s/¯/O~:/g;
    # Tense vowels that share a symbol with a lax correspondent
    $string =~ s/¿/O:/g;
    # Short and long nasalized ae were neutralized; restore the short form
    $string =~ s/®/&~/g;
    $string =~ s/Ž/eI/g;
    $string =~ s/Y/aI/g;
    $string =~ s/š/OI/g;
    $string =~ s/W/aU/g;
    $string =~ s/o/\@U/g;
    $string =~ s/•/I\@/g;
    $string =~ s/‘/E\@/g;
    $string =~ s/Ÿ/U\@/g;
    # Also some consonant digraphs
    $string =~ s/J/dZ/g;
    $string =~ s/C/tS/g;
    $string =~ s/B/n,/g;    # totally arbitrary; N is taken
    $string =~ s/M/m,/g;
    $string =~ s/L/l,/g;
    $string =~ s/R/r\*/g;

    return $string;
}

# Driver: load the CELEX frequencies, convert every word in the input list,
# print one row per word, then print how often each syllable template occurred.
sub main {
    open(my $in_fh, '<', $INPUT_FILE)
        or die "Warning! Can't open input file: $!\n";

    # First, get the CELEX frequencies (so we can rule out some of the really
    # weird stuff from Kessler & Treiman's list)
    open(my $celex_fh, '<', $CELEX_FILE)
        or die "Warning! Can't open CELEX file: $!\n";

    my %celex_freq;
    while (my $line = <$celex_fh>) {
        chomp($line);
        my ($freq, $orthography) = split(/\t/, $line);
        next unless defined $orthography;
        $celex_freq{$orthography} = $freq;
    }
    close($celex_fh);

    print "Freq\tCELEX Freq\tWord\tTranscription\tSyl template\n";

    my %template_count;
    while (my $line = <$in_fh>) {
        chomp($line);
        next if $line =~ /^\s*$/;    # nothing to convert on a blank line

        # Columns after the third are not used here.
        my ($transcription, $orthography, $freq) = split(/\t/, $line);
        $orthography = '' unless defined $orthography;
        $freq        = '' unless defined $freq;

        $transcription = convert_to_celex($transcription);
        my $template = syllable_template($transcription);

        # Words that CELEX does not list get an empty frequency field.
        my $celex = defined $celex_freq{$orthography}
                  ? $celex_freq{$orthography}
                  : '';

        print "$freq\t$celex\t$orthography\t[$transcription]\t$template\n";
        $template_count{$template}++;
    }
    close($in_fh);

    print "\n";
    # Sorted so that repeated runs produce identical output.
    foreach my $template (sort keys %template_count) {
        print "$template\t$template_count{$template}\n";
    }

    return;
}

# Run only when executed as a script, so the helper subs can be loaded
# (e.g. with require) without needing the data files.
main() unless caller;

1;