#!/usr/bin/perl -w
use strict;

=head1 NAME

extract_pdf_fonts.pl - extract fonts embedded in a pdf file

=head1 SYNOPSIS

  extract_pdf_fonts.pl file.pdf [extract_directory]

=head1 DESCRIPTION

Relies on pdffonts (from xpdf) and pdftosrc (from pdftex).

=head1 CAVEATS

I was unable to use the resulting pfb file made from an extracted cff file,
even though it was not subsetted. Dunno why.

=head1 TODO

Could also get font info from Multivalent:

  java -cp Multivalent.jar tool.pdf.Info -fonts -verbose file.pdf 

And Multivalent is also able to interpret embedded fonts, in 
its java viewer:

  java -cp Multivalent.jar tool.font.View -pdf file.pdf

Note that the Multivalent tools are not open source.

=head1 SEE ALSO

I got all this info from:

   Mirko Scholz http://www.polbox.com/g/gnnggb/jlhgonx.html
   http://www.searchlores.org/son_font.htm
   http://groups.google.com/group/comp.text.pdf/msg/2b34cb4b707d8189?hl=en&

Tools at http://www.lcdf.org/~eddietwo/type/
may convert.

=cut

use Data::Dumper;

our $DEBUG = $ENV{DEBUG};

sub debug {
    print STDERR 'DEBUG: ', @_, "\n" if $DEBUG;
}

sub info {
    print STDERR "INFO: ", @_, "\n";
}

sub warning {
    print STDERR "WARNING: ", @_, "\n";
}

# an .otf OpenType Font can be either a PostScript outline (cff) or TrueType outline (ttf)

my %FONT_TYPES = (
	  # Adobe PostScript Type 1, binary format (pfa is ascii format)
		  # PFB = Printer Font Binary, PFA = Printer Font ASCII
    'Type 1' => 'pfb',
	  # http://partners.adobe.com/public/developer/opentype/index_cff2.html
	  # compact. Adobe Type 2, and OpenType Type 1 (got that?)
		  # similar to Type 1.
		  # can convert to pfa or pfb with cfftot1 http://www.lcdf.org/~eddietwo/type/cfftot1.1.html
    'Type 1C' => 'cff',
		  # full power of PostScript. Note that "Type 42" is a PS wrapper around TrueType
    'Type 3'=>'pf3',

		  # Apple TrueType, jointly developed w/Microsoft when they hated Adobe.
		  # Later Microsoft and Adobe came out with OpenType = either CFF or TrueType
		  # TrueType has quadratic (third order) bezier instead of PostScript cubic (second order) bezier.
    'TrueType'=>'ttf',

		  # "Type 0" are composite fonts
		  # CID = "Character IDentifier"
		  # CID are for CJK, Type 0 can contain 65k glyphs
    'CID Type 0'=>'cidtype0',
    'CID Type 0C'=>'cff',
    'CID TrueType'=>'ttf'
);

# ((?:CID )?(?:True)?Type(?: \d\w?)?)
my $FONT_TYPE_RE = join('|', keys %FONT_TYPES);

sub get_font_info {
    my ($f) = @_;
    my @lines = `pdffonts $f`;
    die "pdffonts $f failed" unless @lines;
    # shift off header
    shift @lines; shift @lines;
    my $rows = [];
    for (@lines) {
	chomp;
	my ($name, $type, $rest) = m/^(.*?)\s+($FONT_TYPE_RE)\s+(.*)$/;
	warning("can't parse pdffonts line: $_"), next unless $rest;
	my ($emb, $sub, $uni, $objid, $gen) = split(/\s+/, $rest);
	warning("can't parse rest of pdffonts line: '$rest'") unless defined($gen);
	push(@$rows, {
	    name => $name, 
	    type => $type, 
	    embed => $emb, 
	    subset => $sub, 
	    unicode => $uni, 
	    objectid => $objid, 
	    generation => $gen,
	    });
    }
    return $rows;
}

sub extract_fonts {
    my ($f, $d, $font_rows) = @_;
    mkdir $d;
    my ($base) = ($f =~ m/(.*)\./); $base ||= $f;
    for my $fi (@$font_rows) {
	my $name = $fi->{name};
	if ($fi->{embed} ne 'yes') {
	    warning("skipping '$name' because embedded=", $fi->{embed});
	    next;
	}
	info("extracting font '$name'");
	# either pdffonts or pdftosrc is wrong
	my $objid = $fi->{objectid} + 2;
	my $gen = $fi->{generation};
	my $cmd = "pdftosrc $f $objid $gen";
	debug("running: $cmd");
	my $out = `$cmd 2>&1`; 
	warning("command '$cmd' returned non-zero $?, code ", $?>>8, ", output: '$out'"), next if $?;

	# no control over output file of pdftosrc, we have to move it
	my $pdftosrc_out = "$base.$objid" . ($gen ? "+$gen" : '');
	my $suffix = $FONT_TYPES{$fi->{type}} || die("unknown font type '", $fi->{type}, "'");
	my $font_file = "$d/$name.$suffix";
	if (!rename($pdftosrc_out, $font_file)) {
	    warning("failed to move '$pdftosrc_out' to '$font_file'"); next;
	}
	info("extracted font '$font_file'");
    }
}

# extracts them all
sub extract_all {
    my ($f) = @_;
    # get largest object id 
    my $s = `grep Size $f | tail -1`;
    my ($highest) = ($s =~ m/(\d+)/);
    for my $i (0..$highest) {
	system("pdftosrc $f $i");
    }
}

sub main {
    die("Usage: $0 file.pdf [directory]") unless @ARGV >= 1;
    my ($f, $d) = @ARGV;
    $d ||= '.';

    my $font_rows = get_font_info($f);
    debug("extracted: ", Dumper($font_rows));

    extract_fonts($f, $d, $font_rows);
}

main();


