#!/usr/local/bin/perl -w
use strict;

=head1 DESCRIPTION

Takes a pdf and dumps out XML.

=cut

use IO::String;
use lib '/home/mdaoh/downloads/Text-PDF-0.25/lib';
use Text::PDF::File;
use Data::Dumper;

our $DEBUG = $ENV{DEBUG};
sub debug {
    print STDERR 'DEBUG: ', @_, "\n" if $DEBUG;
}
sub warning {
    print STDERR 'WARNING: ', @_, "\n";
}

################################################################
# See Appendix A
# TODO: add a group, argument info, which ones affect state
# maybe just grab whole description from Table 4.10 and others from A.1? 
our %OPERATOR_INFO =
(
 b => ['closepath_fill_close'],
 B => 'fill_stroke',
 'b*' => 'closepath_eofill_stroke',
 'B*' => 'eofill_stroke',

 BDC => 'begin_marked_content_properties',
 BI => 'begin_image',
 BMC => 'begin_marked_content',
 BT => 'begin_text_object',
 BX => 'begin_compatibility',

 c => 'curveto',
 cm => 'concat',
 CS => 'setcolorspace',
 cs => 'setcolorspace',
 );

our @KNOWN_OPERATORS =
(
 # general graphics state
 qw(w J j M d ri i gs),
 # specific graphics state
 qw(q Q cm),
 # path construction
 qw(m l c v y h re),
 # Path painting 
 qw(S s f F f* B B* b b* n),
 # Clipping paths 
 qw(W W* 4.11 205), 
 # Text objects
  qw(BT ET),
 # Text state
  qw(Tc Tw Tz TL Tf Tr Ts),
 # Text positioning
  qw(Td TD Tm T*),
 # Text showing
 qw(Tj TJ ' "),
 # Type 3 fonts 
 qw(d0 d1),
 #Color 
 qw(CS cs SC SCN sc scn G g RG rg K k),
 # Shading patterns
  qw(sh),
  # Inline images
  qw(BI ID EI),
  # XObjects
  qw(Do),
  # Marked content
  qw(MP DP BMC BDC EMC),
  # Compatibility
   qw(BX EX), 
);
our %KNOWN_OPERATORS = map {$_=>1} @KNOWN_OPERATORS;
 
our @NON_OPERATORS = qw(null true false);
our %NON_OPERATORS = map {$_=>1} @NON_OPERATORS;

# From Table A.1 and Table 4.1 from PDF 1.6 specification.
# Not currently used
our $OPERATORS = 
[
 # operator, PostScript equivalent, Short Description, Detail Table, Category, 
 # Operands, Long Description

 ['BX', '', '(PDF 1.1) Begin compatibility section', '3.29', 'Compatibility',
  ],
 ['EX', '', '(PDF 1.1) End compatibility section', '3.29', 'Compatibility',
  ],


 ['cm', 'concat', 'Concatenate matrix to current transformation matrix',  '4.7', 'Special graphics state',
  ],
 ['q', 'gsave', 'Save graphics state', '4.7', 'Special graphics state',
  ],
 ['Q', 'grestore', 'Restore graphics state', '4.7', 'Special graphics state',
  ],

 ['d', 'setdash', 'Set line dash pattern', '4.7', 'General graphics state',
  ],
 ['gs', '', '(PDF 1.2) Set parameters from graphics state parameter dictionary', '4.7',
  ],
 ['i', 'setflat', 'Set flatness tolerance', '4.7', 'General graphics state',
  ],
  ['j', 'setlinejoin', 'Set line join style', '4.7', 'General graphics state',
   ],
 ['J', 'setlinecap', 'Set line cap style', '4.7', 'General graphics state',
  ],
 ['M', 'setmiterlimit', 'Set miter limit', '4.7', 'General graphics state',
  ],
 ['ri', '', 'Set color rendering intent', '4.7', 'General graphics state',
  ],
 ['w', 'setlinewidth', 'Set line width', '4.7', 'General graphics state',
  ],

  ['c', 'curveto', 'Append curved segment to path (three control points)', '4.9', 'Path construction',
   ],
  ['m', 'moveto', 'Begin new subpath', '4.9', 'Path construction',
   ],
   ['l', 'lineto', 'Append straight line segment to path', '4.9', 'Path construction',
    ],
 ['re', '', 'Append rectangle to path', '4.9', 'Path construction',
  ],
#h closepath Close subpath 4.9
#v curveto Append curved segment to path (initial point replicated) 4.9
#y curveto Append curved segment to path (final point replicated) 4.9

 ['b', 'closepath, fill, stroke', "Close, fill, and stroke path using nonzero winding number rule.", '4.10', 'Path painting',
  [],
  "Close, fill, and then stroke the path, using the nonzero winding number rule to determine the region to fill. This operator has the same effect as the sequence h B.",
  ],
 ['b*', 'closepath, eofill, stroke', "Close, fill, and stroke path using even-odd rule.", '4.10', 'Path painting',
  [],
  "Close, fill, and then stroke the path, using the even-odd rule to determine the region to fill. This operator has the same effect as the sequence h B*." ,
  ],
 ['B', 'fill, stroke', 'Fill and stroke path using nonzero winding number rule', '4.10', 'Path painting',
  [],
  "Fill and then stroke the path, using the nonzero winding number rule to determine the region to fill. This operator produces the same result as constructing two identical path objects, painting the first with f and the second with S. Note, however, that the filling and stroking portions of the operation consult different values of several graphics state parameters, such as the current color.", 
  ],
 ['B*', 'eofill, stroke', 'Fill and stroke path using even-odd rule.', '4.10', 'Path painting',
  [],
  "Fill and then stroke the path, using the even-odd rule to determine the region to fill. This operator produces the same result as B, except that the path is filled as if with f* instead of f.", 
  ],
 ['f', 'fill', 'Fill path using nonzero winding number rule', '4.10', 'Path painting',
  ],
 ['F', 'fill', 'Fill path using nonzero winding number rule (obsolete)', '4.10', 'Path painting',
  ],
 ['f*', 'eofill', 'Fill path using even-odd rule', '4.10', 'Path painting',
  ],
#n End path without filling or stroking 4.10
#s closepath, stroke Close and stroke path 4.10
#S stroke Stroke path 4.10


 ['W', 'clip', 'Set clipping path using nonzero winding number rule', '4.11', 'Clipping paths',
  ],
#W* eoclip Set clipping path using even-odd rule 4.11

 ['g', 'setgray', 'Set gray level for nonstroking operations', '4.24', 'Color',
  ],
 ['CS', 'setcolorspace', '(PDF 1.1) Set color space for stroking operations', '4.24', 'Color',
  ],
 ['cs', 'setcolorspace', '(PDF 1.1) Set color space for nonstroking operations', '4.24', 'Color',
  ],
#G setgray Set gray level for stroking operations 4.24
#K setcmykcolor Set CMYK color for stroking operations 4.24
#k setcmykcolor Set CMYK color for nonstroking operations 4.24
#RG setrgbcolor Set RGB color for stroking operations 4.24
#rg setrgbcolor Set RGB color for nonstroking operations 4.24
#SC setcolor (PDF 1.1) Set color for stroking operations 4.24
#sc setcolor (PDF 1.1) Set color for nonstroking operations 4.24
#SCN setcolor (PDF 1.2) Set color for stroking operations (ICCBased and special color spaces) 4.24
#scn setcolor (PDF 1.2) Set color for nonstroking operations (ICCBased and special color spaces) 4.24

 ['sh', 'shfill', '(PDF 1.3) Paint area defined by shading pattern', '4.27', 'Shading patterns',
  ],

 ['Do', '', 'Invoke named XObject', '4.37', 'XObjects',
  ],


 ['BI', '', 'Begin inline image object', '4.42', 'Inline images',
  [],
  "Begin an inline image object.", 
  ],
 ['EI', '', 'End inline image object', '4.42', 'Inline images',
  ],
 ['ID', '', 'Begin inline image data', '4.42', 'Inline images',
  ],

 ['Tc', '', 'Set character spacing', '5.2', 'Text state',
  ],
 ['Tf', 'selectfont', 'Set text font and size', '5.2', 'Text state',
  ],
 ['TL', '', 'Set text leading', '5.2', 'Text state',
  ],
 ['Tr', '', 'Set text rendering mode', '5.2', 'Text state',
  ],
 ['Ts', '', 'Set text rise', '5.2', 'Text state',
  ],
 ['Tw', '', 'Set word spacing', '5.2', 'Text state',
  ],
 ['Tz', '', 'Set horizontal text scaling', '5.2', 'Text state',
  ],

 ['BT', '', 'Begin text object', '5.4', 'Text objects',
 [],
  "Begin a text object, initializing the text matrix, T m, and the text line matrix, Tlm, to the identity matrix. Text objects cannot be nested; a second BT cannot appear before an ET.", 
  ],
 ['ET', '', 'End text object', '5.4', 'Text objects',
  ],

 ['T*', '', 'Move to start of next text line', '5.5'
  ],
 ['Td', '', 'Move text position', '5.5', 'Text positioning',
  ],
 ['Tm', '', 'Set text matrix and text line matrix', '5.5', 'Text positioning',
  ],
 ['TD', '', 'Move text position and set leading', '5.5', 'Text positioning',
  ],

 ['Tj', 'show', 'Show text', '5.6', 'Text showing',
  ],
 ['TJ', '', 'Show text, allowing individual glyph positioning', '5.6', 'Text showing',
  ],
 ['\'', '', 'Move to next line and show text', '5.6', 'Text showing',
  ],
 ["\"", '', 'Set word and character spacing, move to next line, and show text', '5.6', 'Text showing',
  ],


 ['d0', 'setcharwidth', 'Set glyph width in Type 3 font', '5.10', 'Type 3 fonts',
  ],
 ['d1', 'setcachedevice', 'Set glyph width and bounding box in Type 3 font', '5.10', 'Type 3 fonts',
  ],

 ['BDC', '', '(PDF 1.2) Begin marked-content sequence with property list.', '10.7', 'Marked content',
  'tag properties',
  "Begin a marked-content sequence with an associated property list, terminated by a balancing EMC operator. tag is a name object indicating the role or significance of the sequence. properties is either an inline dictionary containing the property list or a name object associated with it in the Properties subdictionary of the current resource dictionary.",
  ],
 ['BMC', '', '(PDF 1.2) Begin marked-content sequence', '10.7', 'Marked content',
  'tag',
  "Begin a marked-content sequence terminated by a balancing EMC operator. tag is a name object indicating the role or significance of the sequence." ,
  ],
 ['DP', '', '(PDF 1.2) Define marked-content point with property list', '10.7', 'Marked content',
  ],
 ['EMC', '', '(PDF 1.2) End marked-content sequence', '10.7', 'Marked content',
  ],
#MP (PDF 1.2) Define marked-content point 10.7


 ];


################################################################

our $WARN_DUP_WRITES;

# locally bound in walk_obj to indicate whether we are in a content stream
our $IN_CONTENTS;

################################################################

# modification of PDF-API2-0.46.003/contrib/pdf-optimize.pl
# to also work with Text::PDF and not require copying
sub walk_obj {
    my ($objs, $pdf, $filter, $obj, @objkeys) = @_;

    $obj->realise() # if(ref($obj)=~/Objind$/)
	;

    if(defined $objs->{scalar $obj}) {
	warning("skipping already written ", scalar $obj) if $WARN_DUP_WRITES;
	return($objs->{scalar $obj});
    }

  die "object already copied" if(   $obj->{' copied'});
#    my $objcopy = $obj->copy($pdf);
    $obj->{' copied'}=1;
  $objs->{scalar $obj}=$obj;

    # array
    if(ref($obj)=~/Array$/ || (UNIVERSAL::can($obj,'isa') && ($obj->isa('PDF::API2::Basic::PDF::Array') || $obj->isa('Text::PDF::Array')))) {
        $filter->start('array', $obj);
	my $count = 0;
	my @children = $obj->elementsof();
	debug "about to walk over array $obj with ", scalar(@children), " children";
        foreach my $child (@children) {
            $child->realise() # if(ref($child)=~/Objind$/)
		;
	    $filter->start('item', $child, $count);
	    my $recurse = walk_obj($objs, $pdf, $filter, $child);
            $filter->end('item', $recurse, $count);
        }
	$filter->end('array', $obj);
    } 

    # dict
    elsif(ref($obj)=~/Dict$/ || (UNIVERSAL::can($obj,'isa') && ($obj->isa('PDF::API2::Basic::PDF::Dict') ||$obj->isa('Text::PDF::Dict'))) ) {
	if ($obj->isa('Text::PDF::Pages')) {
	    if ($obj->{Kids}) {
		my @kids = $obj->get_kids() ;
		debug "got ", scalar(@kids), " kids for Pages";
	    }
	    else {
		debug "NO Kids in $obj";
	    }
	}
        $filter->start('dict', $obj);
	# debug "about to do dict: ", Dumper($obj), Dumper($objcopy);
	if (scalar @objkeys <1) {
	    @objkeys = grep {$_ !~ m/^ /} keys(%{$obj});
	    # debug "got ", scalar(@objkeys), " keys: ", Dumper([@objkeys]);
	}
        foreach my $k (@objkeys) {
	    my $child = $obj->{$k};
	    if (!defined($child)) {
		warning "SKIPPING MISSING CHILD '$k'";
		next;
	    }
	    if ($k eq 'Kids') {
		# debug "about to walk Kids: ", Dumper($child);
		my @child_els = $child->elementsof();
		debug "about to walk Kids $child with ", scalar(@child_els), " children";
		$child->realise();
	    }
	    $filter->start('entry', $obj, $k);
	    local $IN_CONTENTS = ($k eq 'Contents');
	    my $recurse = walk_obj($objs, $pdf, $filter, $child);
            $filter->end('entry', $recurse, $k);
        }

	if (defined $obj->{' stream'} or defined $obj->{' streamfile'} or defined $obj->{' streamloc'}) { 
	    for my $k (qw(stream streamfile streamloc)) {
		my $v = $obj->{" $k"};
		debug "$k=", (defined($v) ? ($k eq 'stream' ? 'defined' : "$v") : 'undefined'); 
	    }
	    if (!defined $obj->{' stream'}) {
		debug("stream not already read, so reading it");
		$obj->read_stream() 
	    }

	    debug "filters are: ", Dumper($obj->{Filter});

	    # if ' nofilt' is true, then ' stream' holds exact bytes from input file
	    debug "nofilt=", $obj->{' nofilt'};
	    if ($IN_CONTENTS) {
		if ($obj->{' nofilt'}) {
		    if (0) {
			debug("stream read and filtered, so reading it again");
			# not all filters are implemented, such as DCTDecode
			$obj->read_stream();
		    }
		    if (unfilter($obj)) {
			# $obj->{' stream'} =~ s/\015\012?/\n/g;
		    }
		    else {die "couldn't unfilter Contents"}
		}
		my $instructions = parse_content_stream($obj->{' stream'});
		$filter->start('contentstream');
		for my $inst (@$instructions) {
		    my ($operator, @operands) = @$inst;
		    $filter->start('inst', undef, $operator);
		    for my $operand (@operands) {
			my $recurse = walk_obj($objs, $pdf, $filter, $operand);
		    }
		    $filter->end('inst');
		}
		$filter->end('contentstream');
	    }
	    else {
		if ($obj->{' nofilt'} && $obj->{Filter}) {
		    warning "couldn't unfilter stream with filters: ", Dumper($obj->{Filter}) unless unfilter($obj);
		}
		$filter->streambytes($obj->{' stream'}, $obj);
	    }
        }
        $filter->end('dict', $obj);
    }

    # otherwise
    else {
	$filter->atom($obj);
    }

    return($obj);
}


sub parse_content_stream {
    my ($s) = @_;
    $s =~ s/\015\012?/\n/g;
    # create a File to parse just this string, to fool readval
    my $pdf = Text::PDF::File->new();
    if (0) {
	my $io = IO::String->new();
	$io->open($s);
	$pdf->{' INFILE'} = $io;
    }
    else {
	$pdf->{' INFILE'} = IO::String->new();
    }
    my $rest = $s;
    my $obj;
    my @instructions = ();
    my @operands = ();
    while($rest) {
	# we have to exclude some cases of word tokens starting with letters: null true false
	if ($rest =~ m/^\s*([A-Z\*\'\"]+)\s*(.*)/is && !$NON_OPERATORS{$1}) {
	    my $operator = $1;
	    debug("got operator '$operator'");
	    warning("unknown operator '$operator'") unless $KNOWN_OPERATORS{$operator};
	    $rest = $2;
	    push(@instructions, [$operator, @operands]);
	    @operands = ();
	    next;
	}
	debug("about to read from: ", substr($rest, 0, 10));
	($obj, $rest) = $pdf->readval($rest);
	push(@operands, $obj);
	debug("read $obj");
    }
    return [@instructions];
}

my @FILTERS = qw(ASCII85Decode RunLengthDecode ASCIIHexDecode FlateDecode LZWDecode);
my %FILTERS = map {$_ => 1} @FILTERS;

# from Text-PDF-0.25/lib/Text/PDF/Dict.pm
sub unfilter {
    my ($obj) = @_;
    my $data = $obj->{' stream'};
    my $filters = $obj->{Filter};
    my @filters = $filters ? $filters->elementsof() : ();
    if (! @filters) {
	debug "no filters for stream";
	return 0;
    }
    my @filternames = map {$_->val()} @filters;
    debug("have ", scalar(@filternames), " filters: @filternames");
    my @filterobjs = ();
    foreach my $filtername (@filternames) {
	if (! $FILTERS{$filtername}) {
	    warning("filter '$filtername' (out of @filternames) not implemented");
	    return 0;
	}
	my $modulename = 'Text::PDF::' . $filtername;
	push(@filterobjs, $modulename->new());
    }
    for my $filterobj (@filterobjs) {
	debug("applying filter $filterobj");
	$data = $filterobj->infilt($data, 1);
    }
    if (1) {
	$obj->{' nofilt'} = 0;
	$obj->{' stream'} = $data;
    }
    return 1;
}
    

################################################################

our $PDF;
sub main {
    my ($filename) = @_;
    my $p = Text::PDF::File->open($filename, 0) || die "can't open '$filename'";
    local $PDF = $p;
    my $objects = [];
    # because copy calls free_obj even if $filter returns undef
    my $out = FakePDF->new();
    $p->copy($out, sub {
	my $obj = shift;
	my $level = $obj->{' level'} = ($obj->{' parent'}->{' level'}||0) + 1;
	debug "object ", $obj->{' objnum'}, " level $level is: $obj";
	push(@$objects, $obj);
	return undef;
    });
    my $objs = {};
    my $filter = new WalkFilter;
    walk_obj($objs, $p, $filter, $p->{Root});
    walk_obj($objs, $p, $filter, $p->{Info});
    
    # debug "objects: ", Dumper($objects);
}

 
################################################################
package WalkFilter;

sub new {
    bless {
	indent => 0,
    }; 
}

sub indent {
    my ($self) = @_;
    return ' 'x$self->{indent};
}
sub start {
    my ($self, $tagname, $obj, $index_key_or_nothing) = @_;
    return if $tagname eq 'item';
    print $self->indent();
    print "<$tagname";
    print " name=\"$index_key_or_nothing\"" if $tagname eq 'entry' || $tagname eq 'inst';
    if ($obj && $obj->{' uid'} && $tagname ne 'entry') {
	# $obj->is_obj($pdf) sets a uid even if it didn't have one before
	# print ' refnum="', $obj->{' objnum'}, '" refgen="', $obj->{' objgen'}, '"' if $obj->is_obj($::PDF);
	my $numgen = $::PDF->{' objects'}{$obj->uid};
	if ($numgen) {
	    my ($num, $gen) = @$numgen;
	    print " num=\"$num\" gen=\"$gen\"";
	}
	else {
	    ::warning("no numgen for uid ", $obj->{' uid'});
	}
    }
    print ">\n";
    $self->{indent}++;
}
sub end {
    my ($self, $tagname, $obj, $index_key_or_nothing) = @_;
    return if $tagname eq 'item';
    $self->{indent}--;
    print $self->indent(), "</$tagname>\n";
}    
sub streambytes {
    my ($self, $bytes, $obj) = @_;
    print $self->indent(), '<stream>', $bytes, '</stream>', "\n";
}

sub atom {
    my ($self, $obj) = @_;
    my $tagname = ref($obj);
    $tagname =~ s/.*:://;
    my $v = $obj;
    my $atts = '';
    if ($tagname eq 'Number') {
	$v = $obj->as_pdf();
	$tagname = $v =~ m/\./ ? 'f' : 'i';
    }
    elsif ($tagname eq 'String') {
	$tagname = 's';
	$v = $obj->{val};
	# TODO UTF-16BE starting with FEFF for unicode, otherwise it is PDFDocEncoding
	if ($obj->{' isutf'}) {
	    $atts = ' enc="pdfutf"';
	}
	elsif ($obj->{' ishex'}) {
	    $atts = ' enc="pdfhex"';
	}
    }
    elsif ($tagname eq 'Name') {
	$tagname = 'n';
	$v = $obj->{val};
	if (0) {
	    # with leading slash and #-encoding
	    $v = $obj->as_pdf();
	    $atts = ' enc="pdfname"'
	}
    }
    elsif ($tagname eq 'Bool') {
	$tagname = 'b';
	$v = $obj->as_pdf();
    }
    print $self->indent(), "<$tagname$atts>", $v, "</$tagname>", "\n";
}

################################################################
package FakePDF;
sub new {
    return bless {
    };
}
sub free_obj {}

################################################################
package ::;

::main(@ARGV);


