#!/usr/bin/perl use strict; use warnings; use Data::Dump 'dump'; use List::Util qw(first); use List::MoreUtils qw(pairwise minmax); sub any { $_ && return 1 for @_; 0 } my $filename = "42589_rv630_rrg_1.01o.ps"; my $page; my %fonts = ( "F50683" => 1, "F4486" => 1, "F4487" => 1 ); my %bad_page = ( 86 => 1, 214 => 1, 332 => 1); sub find_col_lines { my ($t, $v) = @_; my $min_y = $t->{left}{y} - 0.5; my $max_y = $t->{left}{y} + $t->{left}{h}; my $min_x = $t->{left}{x}; my $max_x = $t->{right}{x}; my @col_lines = grep { $_->{x} > $min_x and $_->{x} < $max_x and $_->{y} >= $min_y and $_->{y} <= $max_y } @$v; if (@col_lines) { $t->{col_top} = $col_lines[0]{y} + $col_lines[0]{h}; } $t->{col_lines} = [ map $_->{x}, @col_lines ]; } sub find_tables { my ($v, $page_num) = @_; my ($min_x, $max_x) = minmax(map $_->{x}, @$v); my @min = grep $_->{x} == $min_x, @$v; my @max = grep $_->{x} == $max_x, @$v; my $num = 0; my @tables = pairwise {{num=>$num++,left=>$a,right=>$b}} @min, @max; foreach my $t (@tables) { if ($bad_page{$page_num}) { $t->{col_lines} = [ 239.52, 280.02, 329.52 ]; $t->{col_top} = 104.7 + 633.54; } else { find_col_lines($t, $v); } } return [grep $_->{col_top}, @tables]; } sub find_table { my ($tables, $y) = @_; return first { my $l = $_->{left}; $l->{y} < $y and $l->{y} + $l->{h} > $y } @$tables; } sub find_table_col { my ($t, $x, $y) = @_; $y > $t->{col_top} and return "heading"; my $col = 0; foreach (@{$t->{col_lines}}) { last if $_ > $x; $col++; } return $col; } sub parse_page { my $p = shift; my $text = $p->{text}; my @h = grep { $_->{w} > 20 and $_->{h} > 0.2 and $_->{h} < 0.8 } @{$p->{lines}}; my @v = grep { $_->{w} < 0.9 and $_->{h} > 1 } @{$p->{lines}}; @v or return; my $tables = find_tables(\@v, $p->{num}); @$tables or return; $p->{tables} = $tables; my $prev; my $cur_text = ''; my $cur_x; my $cur_font; foreach my $t (@$text) { my $x = $t->{x}; my $y = $t->{y}; undef $cur_x; shift @{$t->{lines}} eq "0 0 Td\n" or die; foreach (@{$t->{lines}}) { if (m{^/(F\d+)_0 1 Tf$}) { $cur_font = $1; } if (m{^\((.+)\) ([\d.]+) Tj$}) { if($fonts{$cur_font} and $1 ne "\\225") { $cur_text .= $1; defined $cur_x or $cur_x = $x; $x += $2 * $t->{mul_x}; next; } $x += $2 * $t->{mul_x}; } if (m{(-?[\d.]+) TJm$} and $fonts{$cur_font}) { my $tjm = -$1 * 0.01; $x += $tjm; if ($tjm < 1) { next; } elsif ($tjm < 6.5) { $cur_text .= ' '; next; } } if (m{^(-?[\d.]+) (-?[\d.]+) Td$}) { $x = $t->{x} + $1 * $t->{mul_x}; my $y2 = $t->{y} + $2 * $t->{mul_y}; next if $y2 == $y; $y = $y2; } if (length $cur_text and $fonts{$cur_font}) { found_text($p, $cur_x, $y, $cur_text); $cur_text = ''; undef $cur_x; } } if ($cur_text and $fonts{$cur_font}) { found_text($p, $cur_x, $y, $cur_text); $cur_text = ''; undef $cur_x; } } return { num => $p->{num}, text => $p->{text2}}; } sub found_text { my ($page, $x, $y, $text) = @_; my $table = find_table($page->{tables}, $y); my $col; if (defined $table) { $col = find_table_col($table, $x, $y); } $text =~ s/\\([()])/$1/g; my $i = { table => (defined($table) ? $table->{num} : undef), col => $col, text => $text, }; push @{$page->{text2}}, $i; } sub near { my ($a, $b) = @_; return abs($a - $b) < 1; } open my $fh, $filename or die "$filename: $!"; my (@text, $text, @lines, $page_num); my @parsed; while (<$fh>) { if ($_ eq "pdfEndPage\n") { $page_num > 344 and last; push @parsed, parse_page({ num => $page_num, lines => \@lines, text => \@text, }); @lines = (); @text = (); } if (/^%%Page: (\d+) \d+/) { $page_num = $1; @text = (); next; } if (/^\[(.* [1-9]\d*(\.\d+)?)\] Tm$/) { $1 =~ /^ ([1-9]\d*(?:\.\d+)?)\ 0\ 0 \ ([1-9]\d*(?:\.\d+)?) \ ([1-9]\d*(?:\.\d+)?) \ ([1-9]\d*(?:\.\d+)?)$/x or die $1; $text = { mul_x => $1, mul_y => $2, x => $3, y => $4, lines => [] }; push @text, $text; next; } if ($_ eq "[1 0 0 1 0 0] Tm\n") { undef $text } if (/^.* (Tj|Td|TJm)$/) { $text or next; push @{$text->{lines}}, $_; next; } if (/^.* Tf$/) { m{^/F(\d+)_0 1 Tf$} or die; @text or next; push @{$text->{lines}}, $_; next; } if (/(\d+\.\d+) (\d+\.\d+) (\d+\.\d+) (\d+\.\d+) re$/) { push @lines, { x => $1, y => $2, w => $3, h => $4 }; next; } } close $fh; print dump (\@parsed), "\n";