#!/usr/bin/perl
# This program computes the phonotactic probability metrics employed by
# Jusczyk et al in their study of infants' knowledge of phonotactics.
# It uses the wordlist CelexWordsInTranscription.txt to calculate
# probabilities from the English lexicon, and then applies these
# probabilities to test items, to give predicted well-formedness scores.

use strict;
use warnings;

# Jusczyk et al don't tell us whether they calculated based on only
# monosyllables, or all syllables of all words; leave it as an option,
# to try both ways.
my $count_monosyllables_only = 0;

# Every phoneme is represented by a single character (see remove_digraphs);
# this set is interpolated into the syllable-parsing regexes as a
# character class of nucleus symbols.
my $vowels = "aeiouAEIOU&V\@3Ì¯¿®ŽYšWo•‘ŸBML";

# Token-weighted (log10 frequency) counts and the probabilities derived
# from them, per syllable position.
my (%OnsetFreq, %NucleusFreq, %CodaFreq);
my (%OnsetProb, %NucleusProb, %CodaProb);
my ($total_onsets, $total_nuclei, $total_codas) = (0, 0, 0);

print "\nCounting frequencies in wordlist\n";
open(my $wordlist, '<', "CelexWordsInTranscription.txt")
    or die "Warning! Can't open wordlist: $!\n";

while (my $line = <$wordlist>) {
    chomp($line);
    # Expected format: frequency <TAB> orthography <TAB> [syl][syl]... transcription
    my ($freq, $orthography, $transcription) = split("\t", $line);
    my $original_transcription = $transcription;
    $transcription =~ s/^\[//;
    $transcription =~ s/\]$//;
    # It will help a lot to have every phoneme represented by a single character
    $transcription = remove_digraphs($transcription);
    my @syllables = split(/\]\[/, $transcription);

    if ($count_monosyllables_only and scalar @syllables > 1) {
        next;
    }

    foreach my $syl (@syllables) {
        # Parse the syllable as (consonants)(one nucleus symbol)(consonants).
        if ($syl =~ /([^$vowels]*)([$vowels])([^$vowels]*)/) {
            my ($onset, $nucleus, $coda) = ($1, $2, $3);

            # Jusczyk & al don't say what they do with onsetless or open
            # syllables when they do their counts.  Here "empty" is counted
            # as a separate entity (a space), so that if most words have
            # open syllables, the probability of any real coda is small.
            $onset = " " if $onset eq "";
            $coda  = " " if $coda  eq "";

            # Jusczyk & al also don't specify how they handle complex
            # onsets and codas.  The simplest thing — done here — is to
            # count each cluster as a single entity; the alternative would
            # be to split it up and count each member separately.
            # Counts are weighted by log10 of the word's corpus frequency.
            $OnsetFreq{$onset}     += log10($freq);
            $NucleusFreq{$nucleus} += log10($freq);
            $CodaFreq{$coda}       += log10($freq);
            $total_onsets += log10($freq);
            $total_nuclei += log10($freq);
            $total_codas  += log10($freq);
        }
        else {
            # BUG FIX: on a failed parse the original fell through and
            # re-counted the PREVIOUS syllable's stale onset/nucleus/coda;
            # now the unparsable syllable is simply skipped.
            print "Error: can't parse syllable [$syl] into onset, nucleus, and coda in the word $original_transcription\n";
        }
    }
}
close($wordlist);

# Now that we're done counting from the corpus, we can calculate the
# positional probability of each phoneme (cluster) as its share of the
# total weighted mass for that position.
print "\nCalculating positional probabilities\n";

foreach my $onset (keys %OnsetFreq) {
    $OnsetProb{$onset} = $OnsetFreq{$onset} / $total_onsets;
}
foreach my $nucleus (keys %NucleusFreq) {
    $NucleusProb{$nucleus} = $NucleusFreq{$nucleus} / $total_nuclei;
}
foreach my $coda (keys %CodaFreq) {
    $CodaProb{$coda} = $CodaFreq{$coda} / $total_codas;
}

# Let's save the results in a file, in decreasing probability.
open(my $posprobfile, '>', "CelexPositionalProbabilities.txt")
    or die "Warning! Can't create positional probabilties file: $!\n";

# BUG FIX throughout this section: use print, not printf — the data was
# being passed as the FORMAT string, so any '%' in it would corrupt output.
print $posprobfile "*****************************\n Onset Probabilities\n*****************************\n";
foreach my $onset (sort { $OnsetProb{$b} <=> $OnsetProb{$a} } keys %OnsetProb) {
    print $posprobfile replace_digraphs($onset) . "\t$OnsetProb{$onset}\n";
}
print $posprobfile "\n*****************************\n Nucleus Probabilities\n*****************************\n";
foreach my $nucleus (sort { $NucleusProb{$b} <=> $NucleusProb{$a} } keys %NucleusProb) {
    print $posprobfile replace_digraphs($nucleus) . "\t$NucleusProb{$nucleus}\n";
}
print $posprobfile "\n*****************************\n Coda Probabilities\n*****************************\n";
foreach my $coda (sort { $CodaProb{$b} <=> $CodaProb{$a} } keys %CodaProb) {
    print $posprobfile replace_digraphs($coda) . "\t$CodaProb{$coda}\n";
}
close($posprobfile);

# Finally, use these numbers to calculate probabilities for made-up words
# (in this case, Jusczyk et al's experimental stimuli).
# We have several files to go through, so put it in a loop.
print "\nDeriving predictions for test items\n";
my @stimfiles = ("Exp3-Low.txt", "Exp3-High.txt", "Exp1-Low.txt",
                 "Exp1-High.txt", "AlbrightHayes.txt");

foreach my $file (@stimfiles) {
    open(my $stimfile, '<', $file)
        or die "Warning! Can't open stimulus file: $!\n";

    # BUG FIX: the dot in the extension was an unescaped regex metacharacter.
    (my $file_prefix = $file) =~ s/\.txt$//;
    open(my $predictionsfile, '>', "$file_prefix-Predictions.txt")
        or die "Warning! Can't create file to store predictions for $file\n$!\n";
    print $predictionsfile "Stimulus\tAvg. Positional Prob.\n";

    my $number_of_items = 0;
    my $errors = 0;
    # BUG FIX: this accumulator was never reset between stimulus files, so
    # every file after the first reported an inflated AVERAGE.
    my $CumulativePositionalProb = 0;

    while (my $line = <$stimfile>) {
        chomp($line);
        my $word = remove_digraphs($line);
        $number_of_items++;

        # We assume all test words have just one syllable (J & al's stims do).
        my $SummedPositionalProb;
        if ($word =~ /([^$vowels]*)([$vowels])([^$vowels]*)/) {
            my ($onset, $nucleus, $coda) = ($1, $2, $3);
            $onset   = " " if $onset   eq "";
            $nucleus = " " if $nucleus eq "";
            $coda    = " " if $coda    eq "";
            # Constituents unattested in the corpus contribute probability 0
            # (the original summed undef silently; made explicit here).
            $SummedPositionalProb = ($OnsetProb{$onset}     || 0)
                                  + ($NucleusProb{$nucleus} || 0)
                                  + ($CodaProb{$coda}       || 0);
            $CumulativePositionalProb += $SummedPositionalProb;
        }
        else {
            print "Error: can't parse syllable [$word] into onset, nucleus, and coda\n";
            $SummedPositionalProb = "";
            $errors++;
        }
        # Write the results to a file (print, not printf: $line is data).
        print $predictionsfile "$line\t$SummedPositionalProb\n";
    }

    # BUG FIX: guard against division by zero when no item parsed.
    my $valid_items = $number_of_items - $errors;
    if ($valid_items > 0) {
        print $predictionsfile "\nAVERAGE:\t"
            . $CumulativePositionalProb / $valid_items . "\n";
    }
    close($predictionsfile);
    close($stimfile);
}

# Base-10 logarithm, used for log-frequency weighting of the counts.
# NOTE(review): a zero corpus frequency would make log() die — assumes the
# Celex list has freq >= 1 throughout; confirm against the data file.
sub log10 {
    my $n = shift;
    return log($n) / log(10);
}

# Rewrite every multi-character (digraph) phoneme symbol in a transcription
# as a single-character stand-in, so character classes and per-character
# counting treat each phoneme as one unit.  Inverse of replace_digraphs.
sub remove_digraphs {
    my $string = shift;
    # Some digraphs indicate length redundantly on tense vowels; removing
    # the colon won't result in any neutralizations
    $string =~ s/i:/i/g;
    $string =~ s/A:/A/g;
    $string =~ s/u:/u/g;
    $string =~ s/3:/3/g;
    $string =~ s/A~:/Ì/g;
    $string =~ s/O~:/¯/g;
    # Some tense vowels have lax correspondents with the same symbol;
    # have to change
    $string =~ s/O:/¿/g;
    # Nasalized short and long ¾ doesn't even seem like a real distinction;
    # I'm going to neutralize them.
    # BUG FIX: the colon is now optional — the original wrote a mandatory
    # capturing group /(:)/, so the short (colon-less) form never matched.
    $string =~ s/&~:?/®/g;
    $string =~ s/eI/Ž/g;
    $string =~ s/aI/Y/g;
    $string =~ s/OI/š/g;
    $string =~ s/aU/W/g;
    $string =~ s/\@U/o/g;
    # The following usually correspond to r in American English
    $string =~ s/I\@/•/g;
    $string =~ s/E\@/‘/g;
    $string =~ s/U\@/Ÿ/g;
    # Also some consonant digraphs
    $string =~ s/dZ/J/g;
    $string =~ s/tS/C/g;
    $string =~ s/n,/B/g;    # totally arbitrary; N is taken
    $string =~ s/m,/M/g;
    $string =~ s/l,/L/g;
    $string =~ s/r\*/R/g;
    return $string;
}

# Map the single-character stand-ins back to human-readable digraph
# symbols for output.  Inverse of remove_digraphs (modulo the neutralized
# nasal vowel, which is always restored as short &~).
sub replace_digraphs {
    my $string = shift;
    # Restore the redundant length colon on tense vowels
    $string =~ s/i/i:/g;
    $string =~ s/A/A:/g;
    $string =~ s/u/u:/g;
    $string =~ s/3/3:/g;
    $string =~ s/Ì/A~:/g;
    $string =~ s/¯/O~:/g;
    # Tense vowels whose lax correspondents share the symbol
    $string =~ s/¿/O:/g;
    # Neutralized nasal vowel comes back as the short form
    $string =~ s/®/&~/g;
    $string =~ s/Ž/eI/g;
    $string =~ s/Y/aI/g;
    $string =~ s/š/OI/g;
    $string =~ s/W/aU/g;
    $string =~ s/o/\@U/g;
    $string =~ s/•/I\@/g;
    $string =~ s/‘/E\@/g;
    $string =~ s/Ÿ/U\@/g;
    # Also some consonant digraphs
    $string =~ s/J/dZ/g;
    $string =~ s/C/tS/g;
    $string =~ s/B/n,/g;    # totally arbitrary; N is taken
    $string =~ s/M/m,/g;
    $string =~ s/L/l,/g;
    $string =~ s/R/r\*/g;
    return $string;
}