# italian.pl # Same as hepburn8.pl, but for italian files # (Modified a bit to handle more complex changes involving backreferences) $number_of_trials = 1; $input_file = "Italian-ToConvert.txt"; open (INFILE, $input_file) or die "Warning! Can't open input file: $!\n"; $check_file = "Italian-Converted.txt"; open (CHECKFILE, $check_file) or die "Warning! Can't open check file: $!\n"; $rules_file = "ItalianRules.txt"; open (RULESFILE, $rules_file) or die "Warning! Can't open rule file: $!\n"; open (LOGFILE, ">italian3.log") or die "Warning! Can't create log file: $!\n"; # Read in the file and store each line in the rules array of arrays while ($line = ) { chomp($line); unless ($line =~ /^\s*$/) { # skip lines that are just whitespace ($orthographic, $phonemic, @rest) = split("\t", $line); # Now, place this pair onto the end of the @rules array push(@rules, [ $orthographic, $phonemic ]); } } $number_of_rules = $#rules; # Now read in the input forms, and store them so we can learn from them while ($line = ) { chomp($line); push (@inputs, $line); $check_line = ; chomp($check_line); push (@answers, $check_line); } for ($t = 1; $t <= $number_of_trials; $t++) { # For each trial, we start at the start state and try solving it again # Rather than manipulating the rules array directly, we'll just keep track of # the currently hypothesized order, using an "indexing array" # We start by assuming that we'll use the rules in the given order # (that is, the initial ordering is 1, 2, 3, 4, ...) for (my $i = 0; $i <= $number_of_rules; $i++) { $ordering[$i] = $i; } $known_orderings = undef; $explored_orderings = undef; # Start by checking the accuracy of the grammar using the initial ordering: $current_number_correct = check_accuracy(@ordering); $iterations = 0; while ($current_number_correct != ($#inputs + 1)) { $iterations++; # We try to improve performance by swapping rules that might feed or bleed each other # There are various ways to explore the space of possible orderings, # but one way is to start at the end of the grammar and try to "promote" rules # by moving them up in the grammar $swap1 = -1; $swap2 = -1; FINDSWAPPAIR: for (my $r1 = $#rules; $r1 >= 1; $r1--) { for (my $r2 = $r1 - 1; $r2 >= 0; $r2--) { # Check if we already know that r2 crucially precedes r1 next if ($known_orderings =~ /$ordering[$r2]<$ordering[$r1]/ or $explored_orderings =~ /$ordering[$r2]<$ordering[$r1]/ or $explored_orderings =~ /$ordering[$r1]<$ordering[$r2]/ ); # Now check if moving r1 before r2 could possibly change things # There are three possibilities: # (1) r1 and r2 have overlapping structural descriptions (bleeding/counterbleeding) # (2) struc desc of r1 overlaps with output of r2 (currently feeding, try counterfeeding) # (3) struc desc of r2 overlaps with output of r1 (currently counterfeeding, try feeding) # The struc descs are: $rules[$ordering[$r1]][0] and $rules[$ordering[$r2]][0] # "Overlapping" means that at least one of their "terms" could match the same segment # (a "term" in this simplified conception is an expression that refers to a single segment) if (overlap($rules[$ordering[$r1]][0], $rules[$ordering[$r2]][0])) { print "Rules $ordering[$r1] and $ordering[$r2] have overlapping struc descs (potential for bleeding/counterbleeding)\n"; print "\t\t[$rules[$ordering[$r1]][0]] and [$rules[$ordering[$r2]][0]]\n"; ($swap1, $swap2) = ($r1, $r2); } elsif (overlap($rules[$ordering[$r2]][0], $rules[$ordering[$r1]][1])) { print "Rule $ordering[$r2] operates on the output of $ordering[$r1] (currently counterfeeding; trying feeding)\n"; print "\t\t[$rules[$ordering[$r2]][0]] and [$rules[$ordering[$r1]][1]]\n"; ($swap1, $swap2) = ($r1, $r2); } elsif (overlap($rules[$ordering[$r1]][0], $rules[$ordering[$r2]][1])) { print "Rule $ordering[$r1] operates on the output of $ordering[$r2] (currently feeding; trying counterfeeding)\n"; print "\t\t[$rules[$ordering[$r1]][0]] and [$rules[$ordering[$r2]][1]]\n"; ($swap1, $swap2) = ($r1, $r2); } if ($swap1 >= 0) { last FINDSWAPPAIR; } } } # If we got through the FINDSWAPPAIR block and didn't find any eligible swaps, then we're stuck if ($swap1 == -1) { print "****It appears that there is no ordering of these rules that will correctly derive the data****\n"; check_accuracy(@ordering); exit; } else { # Try this swap print "\tAttempting to move rule $ordering[$swap1] before $ordering[$swap2]\n"; @proposed_ordering = (@ordering[0..$swap2-1],$ordering[$swap1],@ordering[$swap2..$swap1-1],@ordering[$swap1+1..$number_of_rules]); print "Proposed ordering: @proposed_ordering\n"; $new_number_correct = check_accuracy(@proposed_ordering); if ($new_number_correct > $current_number_correct) { # Aha, this helped! A new crucial ordering # Save the new order @ordering = @proposed_ordering; $current_number_correct = $new_number_correct; $known_orderings .= "$ordering[$swap1]<$ordering[$swap2] "; $explored_orderings = undef; print "\tNew order is better: rule $proposed_ordering[$swap1] must come before rule $proposed_ordering[$swap2]\n"; print "\t\t$rules[$ordering[$swap1]][0] -> $rules[$ordering[$swap1]][1] PRECEDES "; print "$rules[$ordering[$swap2]][0] -> $rules[$ordering[$swap2]][1]\n"; print "\tKnown orderings: $known_orderings\n"; print "\tCurrent order: @ordering\n"; } elsif ($new_number_correct < $current_number_correct) { # A detrimental move; make sure we don't do it again (at least until something else has changed) $explored_orderings .= "$ordering[$swap2]<$ordering[$swap1] "; print "\tNew order is worse: rule $ordering[$swap2] probably comes before rule $ordering[$swap1]\n"; } # It's also possible that the move made no difference, and we shouldn't try it again. # However, I think it may also be possible that something which seems not to make a difference at # the moment might turn out to be crucial; So, we should keep track of the irrelevant_orderings at the moment, # but keep in mind that if we change any other ordering, then we might as well re-check orderings # that were previously irrelevant (by forgetting that they were irrelevant) else { print "\tNew order makes no difference; leaving things as they were\n\n"; $explored_orderings .= "$ordering[$swap2]<$ordering[$swap1] "; } } } $total_iterations += $iterations; print "Trial $t took $iterations iterations\n"; printf LOGFILE "Trial $t took $iterations iterations\n"; } # Now that we're done, the average iterations is the total over the number of trials $average_iterations = $total_iterations / $number_of_trials; printf "\nAfter $number_of_trials trials, the average solution time is %.2f iterations\n", $average_iterations; sub check_accuracy { my @check_ordering = @_; my $correct = 0; for ($i = 0; $i <= $#inputs; $i++) { # We'll start with the current input, and transform it $output = $inputs[$i]; for ($r = 0; $r <= $number_of_rules; $r++) { # Nab the backreferences if ($output =~ /$rules[$check_ordering[$r]][0]/) { @backrefs = ($output =~ /$rules[$check_ordering[$r]][0]/); } # Now do the replacement, putting in the "dummy" backref $1, etc. $output =~ s/$rules[$check_ordering[$r]][0]/$rules[$check_ordering[$r]][1]/g; while ($output =~ /\$(\d+)/) { $replacement = $backrefs['$1']; $output =~ s/\$$1/$replacement/; } } # Now check answer against the "real" answer in the checkfile if ($output eq $answers[$i]) { $correct++; } else { print "\tIncorrect on form $i ($inputs[$i]: deriving [$output] instead of [$answers[$i]])\n"; } } return $correct; } sub overlap { my $string1 = @_[0]; my $string2 = @_[1]; if (length($string1) > length($string2) ) { return ($string1 =~ /\Q$string2\E/); } else { return ($string2 =~ /\Q$string1\E/); } }