jbig2enc: pdf.py patched for right pdf page size

Discussions, questions, comments, ideas, and your projects having to do with DIY Book Scanner software. This includes the Stereo Data Maker software for the cameras, post-processing software, utilities, OCR packages, and so on.

Moderator: peterZ

Post Reply
User avatar
dingodog
Posts: 110
Joined: 22 Jul 2010, 18:19
Number of books owned: 1000
Country: on the net
Location: on the net
Contact:

jbig2enc: pdf.py patched for right pdf page size

Post by dingodog »

rubypdf posted a patch for pdf.py
https://github.com/agl/jbig2enc/issues/15

to generate proper pdf page size from jbig2enc encoded data

Code: Select all

jbig2 -s -p -v *.ext && pdf.py output>output.pdf
I applied patch and successful used the new pdf.py to generate proper page size

Code: Select all

#!/usr/bin/python

import sys
import re
import struct
import glob
import os

# This is a very simple script to make a PDF file out of the output of a
# multipage symbol compression.
# Run ./jbig2 -s -p <other options> image1.jpeg image1.jpeg ...
# python pdf.py output > out.pdf

#dpi = 600

class Ref:
  def __init__(self, x):
    self.x = x
  def __str__(self):
    return "%d 0 R" % self.x

class Dict:
  def __init__(self, values = {}):
    self.d = {}
    self.d.update(values)

  def __str__(self):
    s = ['<< ']
    for (x, y) in self.d.items():
      s.append('/%s ' % x)
      s.append(str(y))
      s.append("\n")
    s.append(">>\n")

    return ''.join(s)

global_next_id = 1

class Obj:
  next_id = 1
  def __init__(self, d = {}, stream = None):
    global global_next_id

    if stream is not None:
      d['Length'] = str(len(stream))
    self.d = Dict(d)
    self.stream = stream
    self.id = global_next_id
    global_next_id += 1

  def __str__(self):
    s = []
    s.append(str(self.d))
    if self.stream is not None:
      s.append('stream\n')
      s.append(self.stream)
      s.append('\nendstream\n')
    s.append('endobj\n')

    return ''.join(s)

class Doc:
  def __init__(self):
    self.objs = []
    self.pages = []

  def add_object(self, o):
    self.objs.append(o)
    return o

  def add_page(self, o):
    self.pages.append(o)
    return self.add_object(o)

  def __str__(self):
    a = []
    j = [0]
    offsets = []

    def add(x):
      a.append(x)
      j[0] += len(x) + 1
    add('%PDF-1.4')
    for o in self.objs:
      offsets.append(j[0])
      add('%d 0 obj' % o.id)
      add(str(o))
    xrefstart = j[0]
    a.append('xref')
    a.append('0 %d' % (len(offsets) + 1))
    a.append('0000000000 65535 f ')
    for o in offsets:
      a.append('%010d 00000 n ' % o)
    a.append('')
    a.append('trailer')
    a.append('<< /Size %d\n/Root 1 0 R >>' % (len(offsets) + 1))
    a.append('startxref')
    a.append(str(xrefstart))
    a.append('%%EOF')

    # sys.stderr.write(str(offsets) + "\n")

    return '\n'.join(a)

def ref(x):
  return '%d 0 R' % x

def main(symboltable='symboltable', pagefiles=glob.glob('page-*')):
  doc = Doc()
  doc.add_object(Obj({'Type' : '/Catalog', 'Outlines' : ref(2), 'Pages' : ref(3)}))
  doc.add_object(Obj({'Type' : '/Outlines', 'Count': '0'}))
  pages = Obj({'Type' : '/Pages'})
  doc.add_object(pages)
  symd = doc.add_object(Obj({}, file(symboltable, 'rb').read()))
  page_objs = []

  pagefiles.sort()
  for p in pagefiles:
    try:
      contents = file(p).read()
    except IOError:
      sys.stderr.write("error reading page file %s\n"% p)
      continue
    (width, height,xres,yres) = struct.unpack('>IIII', contents[11:27])
    xobj = Obj({'Type': '/XObject', 'Subtype': '/Image', 'Width':
        str(width), 'Height': str(height), 'ColorSpace': '/DeviceGray',
        'BitsPerComponent': '1', 'Filter': '/JBIG2Decode', 'DecodeParms':
        ' << /JBIG2Globals %d 0 R >>' % symd.id}, contents)
    contents = Obj({}, 'q %f 0 0 %f 0 0 cm /Im1 Do Q' % (float(width * 72) / xres, float(height * 72) / yres))
    resources = Obj({'ProcSet': '[/PDF /ImageB]',
        'XObject': '<< /Im1 %d 0 R >>' % xobj.id})
    page = Obj({'Type': '/Page', 'Parent': '3 0 R',
        'MediaBox': '[ 0 0 %f %f ]' % (float(width * 72) / xres, float(height * 72) / yres),
        'Contents': ref(contents.id),
        'Resources': ref(resources.id)})
    [doc.add_object(x) for x in [xobj, contents, resources, page]]
    page_objs.append(page)

    pages.d.d['Count'] = str(len(page_objs))
    pages.d.d['Kids'] = '[' + ' '.join([ref(x.id) for x in page_objs]) + ']'

  print str(doc)


def usage(script, msg):
  if msg:
    sys.stderr.write("%s: %s\n"% (script, msg))
  sys.stderr.write("Usage: %s [file_basename] > out.pdf\n"% script)
  sys.exit(1)


if __name__ == '__main__':
  if sys.platform == "win32":
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

  if len(sys.argv) == 2:
    sym = sys.argv[1] + '.sym'
    pages = glob.glob(sys.argv[1] + '.[0-9]*')
  elif len(sys.argv) == 1:
    sym = 'symboltable'
    pages = glob.glob('page-*')
  else:
    usage(sys.argv[0], "wrong number of args!")

  if not os.path.exists(sym):
    usage(sys.argv[0], "symbol table %s not found!"% sym)
  elif len(pages) == 0:
    usage(sys.argv[0], "no pages found!")

  main(sym, pages)
Attachments
pdf.py-patched.zip
(1.81 KiB) Downloaded 368 times
User avatar
Misty
Posts: 481
Joined: 06 Nov 2009, 12:20
Number of books owned: 0
Location: Frozen Wasteland

Re: jbig2enc: pdf.py patched for right pdf page size

Post by Misty »

This patch is also now available in Homebrew for OS X users. If you have Homebrew, you can install it by doing

Code: Select all

brew install jbig2enc
If you already have jbig2enc installed, you can get the patched version by doing:

Code: Select all

brew uninstall jbig2enc && brew install jbig2enc
The opinions expressed in this post are my own and do not necessarily represent those of the Canadian Museum for Human Rights.
User avatar
dingodog
Posts: 110
Joined: 22 Jul 2010, 18:19
Number of books owned: 1000
Country: on the net
Location: on the net
Contact:

Re: jbig2enc: pdf.py patched for right pdf page size

Post by dingodog »

HU-HO! I spoked too soon!

We have a serious problem

if we edit images with a program that strip the original dpi info (settling to zero) we have this error:

Code: Select all

Traceback (most recent call last):
File "pdf.py", line 171, in
main(sym, pages)
File "pdf.py", line 129, in main
contents = Obj({}, 'q %f 0 0 %f 0 0 cm /Im1 Do Q' % (float(width * 72) / xres, float(height * 72) / yres))
ZeroDivisionError: float division
I attach the test image edited with mtPaint
Attachments
test-image.zip
(235.44 KiB) Downloaded 333 times
User avatar
dingodog
Posts: 110
Joined: 22 Jul 2010, 18:19
Number of books owned: 1000
Country: on the net
Location: on the net
Contact:

Re: jbig2enc: pdf.py patched for right pdf page size

Post by dingodog »

rubypdf fixed pdf.py

I attach source code:

Code: Select all

#!/usr/bin/python

import sys
import re
import struct
import glob
import os

# This is a very simple script to make a PDF file out of the output of a
# multipage symbol compression.
# Run ./jbig2 -s -p <other options> image1.jpeg image1.jpeg ...
# python pdf.py output > out.pdf

dpi = 72

class Ref:
  def __init__(self, x):
    self.x = x
  def __str__(self):
    return "%d 0 R" % self.x

class Dict:
  def __init__(self, values = {}):
    self.d = {}
    self.d.update(values)

  def __str__(self):
    s = ['<< ']
    for (x, y) in self.d.items():
      s.append('/%s ' % x)
      s.append(str(y))
      s.append("\n")
    s.append(">>\n")

    return ''.join(s)

global_next_id = 1

class Obj:
  next_id = 1
  def __init__(self, d = {}, stream = None):
    global global_next_id

    if stream is not None:
      d['Length'] = str(len(stream))
    self.d = Dict(d)
    self.stream = stream
    self.id = global_next_id
    global_next_id += 1

  def __str__(self):
    s = []
    s.append(str(self.d))
    if self.stream is not None:
      s.append('stream\n')
      s.append(self.stream)
      s.append('\nendstream\n')
    s.append('endobj\n')

    return ''.join(s)

class Doc:
  def __init__(self):
    self.objs = []
    self.pages = []

  def add_object(self, o):
    self.objs.append(o)
    return o

  def add_page(self, o):
    self.pages.append(o)
    return self.add_object(o)

  def __str__(self):
    a = []
    j = [0]
    offsets = []

    def add(x):
      a.append(x)
      j[0] += len(x) + 1
    add('%PDF-1.4')
    for o in self.objs:
      offsets.append(j[0])
      add('%d 0 obj' % o.id)
      add(str(o))
    xrefstart = j[0]
    a.append('xref')
    a.append('0 %d' % (len(offsets) + 1))
    a.append('0000000000 65535 f ')
    for o in offsets:
      a.append('%010d 00000 n ' % o)
    a.append('')
    a.append('trailer')
    a.append('<< /Size %d\n/Root 1 0 R >>' % (len(offsets) + 1))
    a.append('startxref')
    a.append(str(xrefstart))
    a.append('%%EOF')

    # sys.stderr.write(str(offsets) + "\n")

    return '\n'.join(a)

def ref(x):
  return '%d 0 R' % x

def main(symboltable='symboltable', pagefiles=glob.glob('page-*')):
  doc = Doc()
  doc.add_object(Obj({'Type' : '/Catalog', 'Outlines' : ref(2), 'Pages' : ref(3)}))
  doc.add_object(Obj({'Type' : '/Outlines', 'Count': '0'}))
  pages = Obj({'Type' : '/Pages'})
  doc.add_object(pages)
  symd = doc.add_object(Obj({}, file(symboltable, 'rb').read()))
  page_objs = []

  pagefiles.sort()
  for p in pagefiles:
    try:
      contents = file(p).read()
    except IOError:
      sys.stderr.write("error reading page file %s\n"% p)
      continue
    (width, height,xres,yres) = struct.unpack('>IIII', contents[11:27])
    
    if xres==0:
    	xres=dpi    	
    if yres==0:
    	yres=dpi
    	
    xobj = Obj({'Type': '/XObject', 'Subtype': '/Image', 'Width':
        str(width), 'Height': str(height), 'ColorSpace': '/DeviceGray',
        'BitsPerComponent': '1', 'Filter': '/JBIG2Decode', 'DecodeParms':
        ' << /JBIG2Globals %d 0 R >>' % symd.id}, contents)
    contents = Obj({}, 'q %f 0 0 %f 0 0 cm /Im1 Do Q' % (float(width * 72) / xres, float(height * 72) / yres))
    resources = Obj({'ProcSet': '[/PDF /ImageB]',
        'XObject': '<< /Im1 %d 0 R >>' % xobj.id})
    page = Obj({'Type': '/Page', 'Parent': '3 0 R',
        'MediaBox': '[ 0 0 %f %f ]' % (float(width * 72) / xres, float(height * 72) / yres),
        'Contents': ref(contents.id),
        'Resources': ref(resources.id)})
    [doc.add_object(x) for x in [xobj, contents, resources, page]]
    page_objs.append(page)

    pages.d.d['Count'] = str(len(page_objs))
    pages.d.d['Kids'] = '[' + ' '.join([ref(x.id) for x in page_objs]) + ']'

  print str(doc)


def usage(script, msg):
  if msg:
    sys.stderr.write("%s: %s\n"% (script, msg))
  sys.stderr.write("Usage: %s [file_basename] > out.pdf\n"% script)
  sys.exit(1)


if __name__ == '__main__':
  if sys.platform == "win32":
    import msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

  if len(sys.argv) == 2:
    sym = sys.argv[1] + '.sym'
    pages = glob.glob(sys.argv[1] + '.[0-9]*')
  elif len(sys.argv) == 1:
    sym = 'symboltable'
    pages = glob.glob('page-*')
  else:
    usage(sys.argv[0], "wrong number of args!")

  if not os.path.exists(sym):
    usage(sys.argv[0], "symbol table %s not found!"% sym)
  elif len(pages) == 0:
    usage(sys.argv[0], "no pages found!")

  main(sym, pages)
Attachments
fixed-pdf.py.zip
(1.84 KiB) Downloaded 344 times
rubypdf

Re: jbig2enc: pdf.py patched for right pdf page size

Post by rubypdf »

Thanks for your comments on my homepage, and I notice there exists another bug in jbig2enc(not pdf.py), when the image is bmp format, the output is mirrored.
Post Reply