#!/usr/bin/env perl

# Usage: delattrs foo.gra > foo.gr
#
# Creates a "clean" version of the grammar foo.gra that has no attributes,
# comments, or blank lines.  You can use that version for parsing and
# then compute the attribute values later with buildattrs.
#
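# Also turns counts into probabilities.  In foo.gra, the number before each
# rule X -> Y Z is proportional to p(Y Z | X) as in assignment 1.
# In foo.gr, this number is replaced by p(Y Z | X) itself: the rule's count
# (summed over duplicates) divided by the total count of all rules with the
# same left-hand side X.  (Earlier versions wrote the "weight"
# -log2 p(Y Z | X) instead, but that was incompatible with HW1.)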

use bytes;

# Read the attributed grammar (files named on the command line, or STDIN),
# strip comments and attribute specifications from every line, and accumulate
# into three globals that the output loop at the end of the file consumes:
#   @rules      - distinct attribute-free rules, in order of first appearance
#   %rulecount  - summed count of each distinct rule
#   %totalcount - summed count of all rules that share a left-hand side
# Dies on a line whose first field is not a positive number, or that has a
# count but no left-hand side.
while (my $line = <>) {
  chomp $line;

  # NOTE: comments are stripped before quoted material, so a '#' inside a
  # quoted string is treated as the start of a comment.
  $line =~ s/#.*//;                  # remove any comment on this line
  $line =~ s/"[^"]*"//g;             # delete any quoted material (even if it contains unbalanced brackets)
  # Repeatedly remove innermost balanced bracket pairs until none are left.
  # The outermost of these is the whole attribute spec for a nonterminal.
  1 while $line =~ s/\[[^][]*\]//g;

  my @fields = split ' ', $line;
  next unless @fields;               # skip blank lines

  my $count = shift @fields;         # get number from rule
  # Anchor the pattern so the WHOLE field must be a number; an unanchored
  # match would accept e.g. "1abc" or "1,000" and silently use the value 1.
  die "invalid count $count"
    unless $count =~ /\A\+?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?\z/
        && $count > 0;

  my $lhs = shift @fields;           # get left-hand side
  # A count with nothing after it is not a rule; refuse it rather than
  # recording a rule with an empty left-hand side.
  die "missing left-hand side after count $count" unless defined $lhs;

  my $rule = $lhs . "\t" . join(" ", @fields);   # a canonical form

  # Eliminate duplicate rules (rules that are identical except for
  # attributes), but sum the counts of the duplicates.
  push @rules, $rule unless exists $rulecount{$rule};
  $rulecount{$rule}  += $count;
  $totalcount{$lhs}  += $count;
}

# At the end, dump out all the rules and their probabilities, in order of
# first appearance.  Each output line is "<prob>\t<rule>", where <rule> is the
# canonical "LHS\tRHS" string and
#   <prob> = (summed count of the rule) / (summed count of all rules with that LHS),
# printed with 3 significant digits.
foreach my $rule (@rules) {
  # The canonical form is "LHS\tRHS", so the LHS is everything before the
  # first tab.  Splitting (rather than matching and then reading $&) cannot
  # pick up a stale value left over from an earlier successful match.
  my ($lhs) = split /\t/, $rule, 2;
  my $prob = $rulecount{$rule} / $totalcount{$lhs};
  # Used to use weights instead of probs, but that was incompatible with HW1
  # # $weight = -log($prob) / log(2);
  # # $weight = 0 if $weight==0;     # avoid annoying "-0"
  printf "%.3g\t%s\n", $prob, $rule;
}