pdfstrip.py

   1 #!/usr/bin/env python3
   2 #
   3 # pdfstrip.py - strip objects from PDF files by specifying objects IDs
   4 #
   5 # Copyright (C) 2016  Antonio Ospite <ao2@ao2.it>
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11 #
  12 # This program is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 # GNU General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19
  20 import argparse
  21 import logging
  22 import sys
  23
  24 from pdfrw import PdfReader, PdfWriter
  25 from pdfrw.objects.pdfindirect import PdfIndirect
  26
  27
  28 # pylint: disable=invalid-name
  29 logger = logging.getLogger(__name__)
  30 console_handler = logging.StreamHandler()
  31 formatter = logging.Formatter('[%(levelname)s] %(funcName)s:%(lineno)d %(message)s')
  32 console_handler.setFormatter(formatter)
  33 logger.addHandler(console_handler)
  34 logger.propagate = False
  35
  36
  37 def resolve_objects_names(pdf, objects_ids):
  38     logger.debug("objects_ids %s", objects_ids)
  39     objects_names = []
  40     for objnum in objects_ids:
  41         indirect_object = pdf.findindirect(objnum, 0)
  42         if isinstance(indirect_object, PdfIndirect):
  43             try:
  44                 real_object = indirect_object.real_value()
  45                 if real_object.Name:
  46                     objects_names.append(real_object.Name)
  47                 else:
  48                     logger.warning("Object %d has an empty 'Name' attribute", objnum)
  49             except AttributeError:
  50                 logger.warning("Object %d has no 'Name' attribute", objnum)
  51         else:
  52             logger.warning("Object %d is not a PdfIndirect but a %s",
  53                            objnum, type(indirect_object))
  54
  55     logger.debug("objects_names %s\n", objects_names)
  56     return objects_names
  57
  58
  59 def strip_objects(pdf, objects_names):
  60     for i, page in enumerate(pdf.pages):
  61         logger.debug("Page %d", i + 1)
  62         logger.debug("Before %s", page.Resources.XObject.keys())
  63         for obj in objects_names:
  64             if obj in page.Resources.XObject:
  65                 del page.Resources.XObject[obj]
  66
  67         logger.debug("After  %s\n", page.Resources.XObject.keys())
  68
  69     return pdf
  70
  71
  72 def validate_objects_ids(objects_ids_string):
  73     try:
  74         objects_ids = [int(obj) for obj in objects_ids_string.split(',')]
  75     except (IndexError, ValueError):
  76         raise argparse.ArgumentTypeError("%s contains an invalid value" % objects_ids_string)
  77
  78     return objects_ids
  79
  80
  81 def main():
  82     parser = argparse.ArgumentParser()
  83
  84     parser.add_argument("input_filename",
  85                         help="the input PDF file (better if uncompressed)")
  86     parser.add_argument("output_filename",
  87                         help="the output PDF file")
  88     parser.add_argument("objects_ids",
  89                         type=validate_objects_ids,
  90                         help="a comma-separated list of objects IDs")
  91     parser.add_argument("-d", "--debug", action="store_true",
  92                         help="enable debug output")
  93     args = parser.parse_args()
  94
  95     if args.debug:
  96         logger.setLevel(logging.DEBUG)
  97
  98     pdf_data = PdfReader(args.input_filename)
  99
 100     objects_names = resolve_objects_names(pdf_data, args.objects_ids)
 101     pdf_data = strip_objects(pdf_data, objects_names)
 102
 103     PdfWriter().write(args.output_filename, pdf_data)
 104
 105     return 0
 106
 107
 108 if __name__ == '__main__':
 109     sys.exit(main())