#!/usr/bin/perl

# Usage: randsent [-t] [-T] [-n num-sentences] [-s start-symbol] grammarfiles...
# Type "randsent -?" for details.
#
# Author: Jason Eisner <jason@cs.jhu.edu>, 2001-09-08.
#
# Modifications by Noah A. Smith <nasmith@cs.jhu.edu> 6/17/02-6/30/02.
#  + Ignores lexicalizations in [].
#  + Can specify start symbol on command line.
#  + Logs generated "S1"s.
#     - to a new file each time!
#  + Can generate multiple sentences on one call.
#  + Grammar files given on command line (not piped in)
#  + Calls check-for-new-terms which verifies that no terminal vocab
#    items have been added.
#
# Modifications by Jason Eisner <jason@cs.jhu.edu> 1/16/02.
#  + Removed Noah's modifications that were only for purpose of WS02 lab:
#    - Changed default top symbol from S1 back to START.
#    - Turned off sentence logging via $log (but left the code in).
#    - Removed redundant & undocumented sentence logging via $sentfile.
#    - Removed call to check-for-new-terms.
#  + Added -T option for automatic prettyprinting.
#  + Improved usage message slightly.
#  + Usage message will be printed if arguments are bad.

use strict 'vars';

# Process options.

my($usage, $tree, $pretty, $N, $topsym, $log);

$usage = "randsent [-t] [-T] [-n num-sentences] [-s start-symbol] grammarfiles...

-t: output full parse trees
-T: output full parse trees, prettyprinted
-n: specify how many sentences to generate (default 1)
-s: specify the start symbol (default START)

Grammar files are concatenated; if none, grammar is read from stdin.
For grammar file format and examples, see
   http://www.cs.jhu.edu/~jason/465/hw1/hw1.pdf\n";

$tree = 0;
$pretty = 0;
$N = 1;
$topsym = "START";
$log = 0;    # if turned on, will keep a record of the generated sentences (not trees) in a hidden file
while (@ARGV and $ARGV[0] =~ /^\-./) {   
  my $flag = shift(@ARGV);
  if ($flag eq "-t") { $tree = 1 }
  elsif ($flag eq "-T") { $tree = 1; $pretty = 1 }
  elsif ($flag eq "-n") { defined($N = shift(@ARGV)) || die $usage }
  elsif ($flag eq "-s") { defined($topsym = shift(@ARGV)) || die$usage }
  else { die $usage }   # bad flag, including -help or -?
}

open(LOG, ">.randsentlog.$$") if($log);

# Read grammar.

my(%rules, %totalweight);
while (<>) {
  print C;
  s/\#.*//;            # kill comments (including end-of-line comments)
    s/\[.*?\]//g;     # kill lexicalizations (NAS 6/17/02)
  next unless /\S/;   # skip blank lines
  my(@rule)=split;
  die "Weights must be positive" unless $rule[0] > 0;
  push(@{$rules{$rule[1]}}, \@rule);    # add to list of rules whose LHS=$rule[1]
  $totalweight{$rule[1]} += $rule[0];   # each LHS maintains total weight of rules on its list
}
close C;

# Generate a single sentence.
# Maybe pipe it through prettyprint, depending on command-line opts.

if ($tree && $pretty) {
  open(OUT, "| prettyprint") || open(OUT, "| ./prettyprint") || die "$0: Couldn't open pipe to \"prettyprint\" script (for -T option)\n";
} else {
  open(OUT);
}
die "Error:  $topsym not in the grammar.\n" unless defined $rules{$topsym};

while($N > 0){
  printrand($topsym);
  print OUT "\n";
  print LOG "\n" if($log);
  $N--;
}
close(OUT);
if ($log){
  close(LOG);
  system("chmod a+r .randsentlog.$$");
}

######################################################################

# printrand("FOO") prints a random FOO-rooted sentence (or tree, if $tree)
# to OUT.

sub printrand {      
  my ($root) = @_; 
  if (!defined $rules{$root}) {     # terminal symbol
    print OUT $root, " ";
    print LOG $root, " " if($log);
  } else {                          # nonterminal symbol
    my(@rules) = @{$rules{$root}};            # possible rewrites    
    my($r) = rand(0) * $totalweight{$root};   # choose random number $r
    my(@rule);
    die "oops" unless $r >= 0;
    while ($r >= 0) {                         # pick rewrite matching $r   
      die "oops" unless @rules;
      @rule = @{pop(@rules)};
      $r -= shift(@rule);
    }
    die "oops" unless $root eq shift(@rule);  
    print OUT "($root " if $tree;             # print tree structure
    map printrand($_), @rule;                 # recurse to kids        
    print OUT ")" if $tree;                   # print tree structure   
  }
}