Mageia Bugzilla – Attachment 4166 Details for
Bug 10402
gscan2pdf: wrong encoding with gocr and tesseract not seen
Home
|
New
|
Browse
|
Search
|
[?]
|
Reports
|
Requests
|
Help
|
Log In
[x]
|
New Account
|
Forgot Password
[patch]
patch to fix gocr output, finding of tesseract version and parsing of hocr (boxed ocr) output
gscan2pdf-1.0.6-gocr-encoding-and-tesseract-version.patch (text/plain), 2.04 KB, created by
Pablo Saratxaga
on 2013-06-25 00:52:39 CEST
(
hide
)
Description:
patch to fix gocr output, finding of tesseract version and parsing of hocr (boxed ocr) output
Filename:
MIME Type:
Creator:
Pablo Saratxaga
Created:
2013-06-25 00:52:39 CEST
Size:
2.04 KB
patch
obsolete
>--- ./lib/Gscan2pdf/Tesseract.pm_bak 2013-06-25 00:29:31.495545094 +0200 >+++ ./lib/Gscan2pdf/Tesseract.pm 2013-06-25 00:33:02.302314948 +0200 >@@ -19,7 +19,7 @@ > return; > } > ( $tessdata, $version, $datasuffix ) = >- parse_tessdata(`tesseract '' '' -l '' 2>&1`); >+ parse_tessdata(`tesseract -v 2>&1 ; tesseract '' '' -l '' 2>&1`); > > if ( not defined($tessdata) ) { > if ( defined($version) and $version > 3.01 ) { >@@ -44,7 +44,8 @@ > my @output = @_; > my $output = join ",", @output; > my ( $v, $suffix ); >- $v = $1 + 0 if ( $output =~ /\ v(\d\.\d\d)\ /x ); >+ $v = $1 + 0 if ( $output =~ /^tesseract\ (\d\.\d\d)\.\d+/x ); >+ $v = $1 + 0 if (!$v && $output =~ /\ v(\d\.\d\d)\ /x ); > while ( $output =~ /\n/x ) { > $output =~ s/\n.*$//gx; > } >@@ -159,7 +160,7 @@ > my $txt = File::Temp->new( SUFFIX => $suffix ); > ( my $name, my $path, undef ) = fileparse( $txt, $suffix ); > >- if ( $file !~ /\.tif$/x ) { >+ if ( $file !~ /\.(tif|png|gif|jpeg)$/x ) { > > # Temporary filename for new file > $tif = File::Temp->new( SUFFIX => '.tif' ); >--- ./lib/Gscan2pdf/Page.pm_bak 2013-06-25 00:27:38.948253147 +0200 >+++ ./lib/Gscan2pdf/Page.pm 2013-06-25 00:29:14.558394303 +0200 >@@ -126,7 +126,7 @@ > if ( $token->[1] eq 'span' > and defined( $token->[2]{class} ) > and >- ( $token->[2]{class} eq 'ocr_line' or $token->[2]{class} eq 'ocr_word' ) >+ ( $token->[2]{class} eq 'ocr_line' or $token->[2]{class} =~ m/^ocrx*_word$/ ) > and defined( $token->[2]{title} ) > and $token->[2]{title} =~ /bbox\ (\d+)\ (\d+)\ (\d+)\ (\d+)/x ) > { >--- ./lib/Gscan2pdf.pm_bak 2013-06-25 00:33:34.981464831 +0200 >+++ ./lib/Gscan2pdf.pm 2013-06-25 00:35:49.036488405 +0200 >@@ -1267,9 +1267,12 @@ > > my $new = $page->clone; > >- my $cmd = "gocr $pnm"; >+ my $txt = File::Temp->new( SUFFIX => '.txt' ); >+ my $cmd = "gocr -o $txt $pnm"; > $logger->info($cmd); >- $new->{hocr} = `echo $$ > $pidfile;$cmd`; >+ $cmd = "echo $$ > $pidfile;$cmd"; >+ system($cmd); >+ $new->{hocr} = Gscan2pdf::slurp($txt); > return 
if $_self->{cancel}; > $new->{ocr_flag} = 1; #FlagOCR > $new->{ocr_time} =
--- ./lib/Gscan2pdf/Tesseract.pm_bak 2013-06-25 00:29:31.495545094 +0200 +++ ./lib/Gscan2pdf/Tesseract.pm 2013-06-25 00:33:02.302314948 +0200 @@ -19,7 +19,7 @@ return; } ( $tessdata, $version, $datasuffix ) = - parse_tessdata(`tesseract '' '' -l '' 2>&1`); + parse_tessdata(`tesseract -v 2>&1 ; tesseract '' '' -l '' 2>&1`); if ( not defined($tessdata) ) { if ( defined($version) and $version > 3.01 ) { @@ -44,7 +44,8 @@ my @output = @_; my $output = join ",", @output; my ( $v, $suffix ); - $v = $1 + 0 if ( $output =~ /\ v(\d\.\d\d)\ /x ); + $v = $1 + 0 if ( $output =~ /^tesseract\ (\d\.\d\d)\.\d+/x ); + $v = $1 + 0 if (!$v && $output =~ /\ v(\d\.\d\d)\ /x ); while ( $output =~ /\n/x ) { $output =~ s/\n.*$//gx; } @@ -159,7 +160,7 @@ my $txt = File::Temp->new( SUFFIX => $suffix ); ( my $name, my $path, undef ) = fileparse( $txt, $suffix ); - if ( $file !~ /\.tif$/x ) { + if ( $file !~ /\.(tif|png|gif|jpeg)$/x ) { # Temporary filename for new file $tif = File::Temp->new( SUFFIX => '.tif' ); --- ./lib/Gscan2pdf/Page.pm_bak 2013-06-25 00:27:38.948253147 +0200 +++ ./lib/Gscan2pdf/Page.pm 2013-06-25 00:29:14.558394303 +0200 @@ -126,7 +126,7 @@ if ( $token->[1] eq 'span' and defined( $token->[2]{class} ) and - ( $token->[2]{class} eq 'ocr_line' or $token->[2]{class} eq 'ocr_word' ) + ( $token->[2]{class} eq 'ocr_line' or $token->[2]{class} =~ m/^ocrx*_word$/ ) and defined( $token->[2]{title} ) and $token->[2]{title} =~ /bbox\ (\d+)\ (\d+)\ (\d+)\ (\d+)/x ) { --- ./lib/Gscan2pdf.pm_bak 2013-06-25 00:33:34.981464831 +0200 +++ ./lib/Gscan2pdf.pm 2013-06-25 00:35:49.036488405 +0200 @@ -1267,9 +1267,12 @@ my $new = $page->clone; - my $cmd = "gocr $pnm"; + my $txt = File::Temp->new( SUFFIX => '.txt' ); + my $cmd = "gocr -o $txt $pnm"; $logger->info($cmd); - $new->{hocr} = `echo $$ > $pidfile;$cmd`; + $cmd = "echo $$ > $pidfile;$cmd"; + system($cmd); + $new->{hocr} = Gscan2pdf::slurp($txt); return if $_self->{cancel}; $new->{ocr_flag} = 1; #FlagOCR $new->{ocr_time} =
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 10402
:
4093
|
4096
|
4166
|
4169
|
4191
|
4192