# This program computes the phonotactic probability metrics employed by
# Jusczyk et al in their study of infants' knowledge of phonotactics
# It uses the wordlist CelexWordsInTranscription.txt to calculate
# probabilities from the English lexicon, and then applies these probabilities
# to test items, to give predicted well-formedness scores.
use strict;
use warnings;

my $count_monosyllables_only = 0;

# After remove_digraphs() every phoneme is exactly one character; this string
# is used as a character class listing every possible nucleus character
# (plain vowels plus the single-character stand-ins for vowel digraphs and
# the syllabic consonants B/M/L).
my $vowels = "aeiouAEIOU&V\@3̯¿®ŽYšWo•‘ŸBML";

# Token-weighted positional counts (each occurrence weighted by log10 of the
# word's corpus frequency) and their running totals.
my (%OnsetFreq, %NucleusFreq, %CodaFreq);
my ($total_onsets, $total_nuclei, $total_codas) = (0, 0, 0);

print "\nCounting frequencies in wordlist\n";
open my $wordlist_fh, '<', "CelexWordsInTranscription.txt"
    or die "Warning! Can't open wordlist: $!\n";

while (my $line = <$wordlist_fh>) {
    chomp $line;
    my ($freq, $orthography, $transcription) = split /\t/, $line;
    my $original_transcription = $transcription;

    # Strip the outermost syllable brackets; inner "][" boundaries remain
    # and are used below to split the word into syllables.
    $transcription =~ s/^\[//;
    $transcription =~ s/\]$//;

    # It will help a lot to have every phoneme represented by a single character
    $transcription = remove_digraphs($transcription);
    my @syllables = split /\]\[/, $transcription;

    # Jusczyk et al don't tell us whether they calculated based on only
    # monosyllables, or all syllables of all words.
    # Let's leave it as an option, to try both ways
    next if $count_monosyllables_only and scalar @syllables > 1;

    foreach my $syl (@syllables) {
        # First, adjust positional probabilities
        my ($onset, $nucleus, $coda);
        if ($syl =~ /([^$vowels]*)([$vowels])([^$vowels]*)/) {
            ($onset, $nucleus, $coda) = ($1, $2, $3);
        }
        else {
            print "Error: can't parse syllable [$syl] into onset, nucleus, and coda in the word $original_transcription\n";
            # Skip this syllable entirely; the original fell through and
            # re-counted the previous syllable's onset/nucleus/coda.
            next;
        }

        # Jusczyk & al don't say what they do with onsetless or open syls when
        # they do their counts.  Do they count "empty" as a separate character,
        # so that if most words have open syllables, the probability of any
        # coda is small?  This seems quite reasonable, but I somehow doubt
        # they did it.
        $onset = " " if $onset eq "";
        $coda  = " " if $coda eq "";

        # Another thing that Jusczyk & al don't specify is how they handle
        # counting complex onsets and codas.  The simplest thing is to count
        # them as separate entities, but another possibility would be to split
        # them up and count each member separately:
        # for (my $i = 0; $i < length($onset); $i++) {
        #     $OnsetFreq{substr($onset,$i,1)}++;
        # }
        # for (my $i = 0; $i < length($coda); $i++) {
        #     $CodaFreq{substr($coda,$i,1)}++;
        # }

        # NOTE(review): log10 dies on a zero frequency; assumes the Celex
        # list contains no freq-0 entries -- confirm against the wordlist.
        my $logfreq = log10($freq);
        $OnsetFreq{$onset}     += $logfreq;
        $NucleusFreq{$nucleus} += $logfreq;
        $CodaFreq{$coda}       += $logfreq;
        $total_onsets += $logfreq;
        $total_nuclei += $logfreq;
        $total_codas  += $logfreq;
    }
}
close $wordlist_fh;

# Now that we're done counting from the corpus, we can calculate the probabilities
print "\nCalculating positional probabilities\n";

# First, the positional probabilities of each phoneme
my (%OnsetProb, %NucleusProb, %CodaProb);
foreach my $onset (keys %OnsetFreq) {
    $OnsetProb{$onset} = $OnsetFreq{$onset} / $total_onsets;
}
foreach my $nucleus (keys %NucleusFreq) {
    $NucleusProb{$nucleus} = $NucleusFreq{$nucleus} / $total_nuclei;
}
foreach my $coda (keys %CodaFreq) {
    $CodaProb{$coda} = $CodaFreq{$coda} / $total_codas;
}

# Let's save the results in a file
open my $posprob_fh, '>', "CelexPositionalProbabilities.txt"
    or die "Warning! Can't create positional probabilties file: $!\n";

# Now print the results, in decreasing probability.
# (print, not printf: the phoneme strings are data, not format strings.)
print {$posprob_fh} "*****************************\n Onset Probabilities\n*****************************\n";
foreach my $onset (sort { $OnsetProb{$b} <=> $OnsetProb{$a} } keys %OnsetProb) {
    print {$posprob_fh} replace_digraphs($onset) . "\t$OnsetProb{$onset}\n";
}
print {$posprob_fh} "\n*****************************\n Nucleus Probabilities\n*****************************\n";
foreach my $nucleus (sort { $NucleusProb{$b} <=> $NucleusProb{$a} } keys %NucleusProb) {
    print {$posprob_fh} replace_digraphs($nucleus) . "\t$NucleusProb{$nucleus}\n";
}
print {$posprob_fh} "\n*****************************\n Coda Probabilities\n*****************************\n";
foreach my $coda (sort { $CodaProb{$b} <=> $CodaProb{$a} } keys %CodaProb) {
    print {$posprob_fh} replace_digraphs($coda) . "\t$CodaProb{$coda}\n";
}
close $posprob_fh;

# Finally, use these numbers to calculate probabilities for made-up words
# (In this case, Jusczyk et al's experimental stimuli)
# We have four files to go through, so I'll put it in a loop.
print "\nDeriving predictions for test items\n";
my @stimfiles = ("Exp3-Low.txt", "Exp3-High.txt", "Exp1-Low.txt", "Exp1-High.txt", "AlbrightHayes.txt");

foreach my $file (@stimfiles) {
    open my $stim_fh, '<', $file
        or die "Warning! Can't open stimulus file: $!\n";
    my $file_prefix = $file;
    $file_prefix =~ s/\.txt$//;    # dot escaped; was s/.txt$//
    open my $pred_fh, '>', "$file_prefix-Predictions.txt"
        or die "Warning! Can't create file to store predictions for $file\n$!\n";
    print {$pred_fh} "Stimulus\tAvg. Positional Prob.\n";

    my $number_of_items = 0;
    my $errors          = 0;
    # BUG FIX: this accumulator must start fresh for every stimulus file;
    # previously it carried over, corrupting every AVERAGE after the first.
    my $CumulativePositionalProb = 0;

    while (my $line = <$stim_fh>) {
        chomp $line;
        my $word = remove_digraphs($line);
        $number_of_items++;

        # Calculate the positional probabilities.
        # We will assume that all test words have just one syllable
        # (since J & al's stims do).
        my $SummedPositionalProb;
        if ($word =~ /([^$vowels]*)([$vowels])([^$vowels]*)/) {
            my ($onset, $nucleus, $coda) = ($1, $2, $3);
            $onset = " " if $onset eq "";
            $coda  = " " if $coda eq "";
            # A segment never seen in the corpus has no entry in the
            # probability hashes; treat its probability as 0 rather than
            # warning on undef.
            $SummedPositionalProb = ($OnsetProb{$onset}     || 0)
                                  + ($NucleusProb{$nucleus} || 0)
                                  + ($CodaProb{$coda}       || 0);
            $CumulativePositionalProb += $SummedPositionalProb;
        }
        else {
            print "Error: can't parse syllable [$word] into onset, nucleus, and coda\n";
            $SummedPositionalProb = "";
            $errors++;
        }

        # Write the results to a file
        print {$pred_fh} "$line\t$SummedPositionalProb\n";
    }

    # Guard against division by zero when every item failed to parse
    # (or the file was empty).
    my $scored = $number_of_items - $errors;
    print {$pred_fh} "\nAVERAGE:\t"
        . ($scored > 0 ? $CumulativePositionalProb / $scored : 0) . "\n";
    close $pred_fh;
    close $stim_fh;
}

# Base-10 logarithm (Perl's log() is natural log).
sub log10 {
    my ($n) = @_;
    return log($n) / log(10);
}

# Collapse CELEX multi-character phoneme symbols to single characters so the
# onset/nucleus/coda regex can treat every phoneme as one character.
# Inverse (up to deliberate neutralizations) of replace_digraphs().
sub remove_digraphs {
    my ($string) = @_;
    # Some digraphs indicate length redundantly on tense vowels; removing
    # the colon won't result in any neutralizations
    $string =~ s/i:/i/g;
    $string =~ s/A:/A/g;
    $string =~ s/u:/u/g;
    $string =~ s/3:/3/g;
    $string =~ s/A~:/Ì/g;
    $string =~ s/O~:/¯/g;
    # Some tense vowels have lax correspondents with the same symbol;
    # have to change
    $string =~ s/O:/¿/g;
    # Nasalized short and long ¾ doesn't even seem like a real distinction;
    # I'm going to neutralize them.  (Colon made optional: the original
    # s/&~(:)/ required it, so short &~ was never mapped -- contrary to the
    # stated intent of neutralizing both.)
    $string =~ s/&~:?/®/g;
    $string =~ s/eI/Ž/g;
    $string =~ s/aI/Y/g;
    $string =~ s/OI/š/g;
    $string =~ s/aU/W/g;
    $string =~ s/\@U/o/g;
    # The following usually correspond to r in American English
    $string =~ s/I\@/•/g;
    $string =~ s/E\@/‘/g;
    $string =~ s/U\@/Ÿ/g;
    # Also some consonant digraphs
    $string =~ s/dZ/J/g;
    $string =~ s/tS/C/g;
    $string =~ s/n,/B/g;    # totally arbitrary; N is taken
    $string =~ s/m,/M/g;
    $string =~ s/l,/L/g;
    $string =~ s/r\*/R/g;
    return $string;
}

# Expand the single-character stand-ins back to CELEX digraphs for
# human-readable output.  Not a perfect inverse of remove_digraphs():
# deliberately neutralized distinctions (e.g. nasal length) stay collapsed.
sub replace_digraphs {
    my ($string) = @_;
    # Restore the redundant length colon on tense vowels
    $string =~ s/i/i:/g;
    $string =~ s/A/A:/g;
    $string =~ s/u/u:/g;
    $string =~ s/3/3:/g;
    $string =~ s/Ì/A~:/g;
    $string =~ s/¯/O~:/g;
    # Lax/tense vowels that shared a symbol
    $string =~ s/¿/O:/g;
    # Neutralized nasal vowel comes back as the short form
    $string =~ s/®/&~/g;
    $string =~ s/Ž/eI/g;
    $string =~ s/Y/aI/g;
    $string =~ s/š/OI/g;
    $string =~ s/W/aU/g;
    $string =~ s/o/\@U/g;
    $string =~ s/•/I\@/g;
    $string =~ s/‘/E\@/g;
    $string =~ s/Ÿ/U\@/g;
    # Also some consonant digraphs
    $string =~ s/J/dZ/g;
    $string =~ s/C/tS/g;
    $string =~ s/B/n,/g;    # totally arbitrary; N is taken
    $string =~ s/M/m,/g;
    $string =~ s/L/l,/g;
    $string =~ s/R/r\*/g;
    return $string;
}