Mageia Bugzilla – Attachment 4096 Details for
Bug 10402
gscan2pdf: wrong encoding with gocr and tesseract not seen
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
Log In
[x]
|
New Account
|
Forgot Password
[patch]
improved patch, fixes also parsing of hocr (boxed ocr) output
gscan2pdf-gocr_output-tesseract_version-hocr_ocrx_word.patch (text/plain), 2.25 KB, created by
Pablo Saratxaga
on 2013-06-03 15:45:19 CEST
(
hide
)
Description:
improved patch, fixes also parsing of hocr (boxed ocr) output
Filename:
MIME Type:
Creator:
Pablo Saratxaga
Created:
2013-06-03 15:45:19 CEST
Size:
2.25 KB
patch
obsolete
>--- /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf/Page.pm_bak 2013-06-03 15:35:27.123418902 +0200 >+++ /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf/Page.pm 2013-06-03 15:38:35.631953049 +0200 >@@ -126,7 +126,7 @@ > if ( $token->[1] eq 'span' > and defined( $token->[2]{class} ) > and >- ( $token->[2]{class} eq 'ocr_line' or $token->[2]{class} eq 'ocr_word' ) >+ ( $token->[2]{class} eq 'ocr_line' or $token->[2]{class} =~ m/^ocrx*_word$/ ) > and defined( $token->[2]{title} ) > and $token->[2]{title} =~ /bbox\ (\d+)\ (\d+)\ (\d+)\ (\d+)/x ) > { >--- /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf/Tesseract.pm_bak 2013-06-02 15:36:51.316070939 +0200 >+++ /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf/Tesseract.pm 2013-06-03 15:30:27.249581906 +0200 >@@ -19,8 +19,7 @@ > return; > } > ( $tessdata, $version, $datasuffix ) = >- parse_tessdata(`tesseract '' '' -l '' 2>&1`); >- >+ parse_tessdata(`tesseract -v 2>&1 ; tesseract '' '' -l '' 2>&1`); > if ( not defined($tessdata) ) { > if ( defined($version) and $version > 3.01 ) { > my $exe = `which tesseract`; >@@ -44,7 +43,8 @@ > my @output = @_; > my $output = join ",", @output; > my ( $v, $suffix ); >- $v = $1 + 0 if ( $output =~ /\ v(\d\.\d\d)\ /x ); >+ $v = $1 + 0 if ( $output =~ /^tesseract\ (\d\.\d\d)\.\d+/x ); >+ $v = $1 + 0 if (!$v && $output =~ /\ v(\d\.\d\d)\ /x ); > while ( $output =~ /\n/x ) { > $output =~ s/\n.*$//gx; > } >@@ -159,7 +159,7 @@ > my $txt = File::Temp->new( SUFFIX => $suffix ); > ( my $name, my $path, undef ) = fileparse( $txt, $suffix ); > >- if ( $file !~ /\.tif$/x ) { >+ if ( $file !~ /\.(tif|png|gif|jpeg)$/x ) { > > # Temporary filename for new file > $tif = File::Temp->new( SUFFIX => '.tif' ); >--- /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf.pm_bak 2013-06-02 21:51:25.326366002 +0200 >+++ /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf.pm 2013-06-02 22:05:11.925765974 +0200 >@@ -1267,9 +1267,13 @@ > > my $new = $page->clone; > >- my $cmd = "gocr $pnm"; >+ my $txt = File::Temp->new( SUFFIX => '.txt' ); >+ my 
$cmd = "gocr -o $txt $pnm"; > $logger->info($cmd); >- $new->{hocr} = `echo $$ > $pidfile;$cmd`; >+print "$cmd \n"; >+ $cmd = "echo $$ > $pidfile;$cmd"; >+ system($cmd); >+ $new->{hocr} = Gscan2pdf::slurp($txt); > return if $_self->{cancel}; > $new->{ocr_flag} = 1; #FlagOCR > $new->{ocr_time} =
--- /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf/Page.pm_bak 2013-06-03 15:35:27.123418902 +0200 +++ /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf/Page.pm 2013-06-03 15:38:35.631953049 +0200 @@ -126,7 +126,7 @@ if ( $token->[1] eq 'span' and defined( $token->[2]{class} ) and - ( $token->[2]{class} eq 'ocr_line' or $token->[2]{class} eq 'ocr_word' ) + ( $token->[2]{class} eq 'ocr_line' or $token->[2]{class} =~ m/^ocrx*_word$/ ) and defined( $token->[2]{title} ) and $token->[2]{title} =~ /bbox\ (\d+)\ (\d+)\ (\d+)\ (\d+)/x ) { --- /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf/Tesseract.pm_bak 2013-06-02 15:36:51.316070939 +0200 +++ /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf/Tesseract.pm 2013-06-03 15:30:27.249581906 +0200 @@ -19,8 +19,7 @@ return; } ( $tessdata, $version, $datasuffix ) = - parse_tessdata(`tesseract '' '' -l '' 2>&1`); - + parse_tessdata(`tesseract -v 2>&1 ; tesseract '' '' -l '' 2>&1`); if ( not defined($tessdata) ) { if ( defined($version) and $version > 3.01 ) { my $exe = `which tesseract`; @@ -44,7 +43,8 @@ my @output = @_; my $output = join ",", @output; my ( $v, $suffix ); - $v = $1 + 0 if ( $output =~ /\ v(\d\.\d\d)\ /x ); + $v = $1 + 0 if ( $output =~ /^tesseract\ (\d\.\d\d)\.\d+/x ); + $v = $1 + 0 if (!$v && $output =~ /\ v(\d\.\d\d)\ /x ); while ( $output =~ /\n/x ) { $output =~ s/\n.*$//gx; } @@ -159,7 +159,7 @@ my $txt = File::Temp->new( SUFFIX => $suffix ); ( my $name, my $path, undef ) = fileparse( $txt, $suffix ); - if ( $file !~ /\.tif$/x ) { + if ( $file !~ /\.(tif|png|gif|jpeg)$/x ) { # Temporary filename for new file $tif = File::Temp->new( SUFFIX => '.tif' ); --- /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf.pm_bak 2013-06-02 21:51:25.326366002 +0200 +++ /usr/lib/perl5/vendor_perl/5.16.2/Gscan2pdf.pm 2013-06-02 22:05:11.925765974 +0200 @@ -1267,9 +1267,13 @@ my $new = $page->clone; - my $cmd = "gocr $pnm"; + my $txt = File::Temp->new( SUFFIX => '.txt' ); + my $cmd = "gocr -o $txt $pnm"; $logger->info($cmd); - $new->{hocr} = `echo $$ > 
$pidfile;$cmd`; +print "$cmd \n"; + $cmd = "echo $$ > $pidfile;$cmd"; + system($cmd); + $new->{hocr} = Gscan2pdf::slurp($txt); return if $_self->{cancel}; $new->{ocr_flag} = 1; #FlagOCR $new->{ocr_time} =
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 10402
:
4093
|
4096
|
4166
|
4169
|
4191
|
4192