# pdfstrip.py - strip objects from PDF files by specifying object IDs
#
# Copyright (C) 2016 Antonio Ospite <ao2@ao2.it>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import logging

from pdfrw import PdfReader, PdfWriter
from pdfrw.objects.pdfindirect import PdfIndirect
# Module-wide logger with its own console handler.  The lowercase names are
# kept on purpose, hence the pylint override.
# pylint: disable=invalid-name
logger = logging.getLogger(__name__)
console_handler = logging.StreamHandler()
# Prefix every message with its level and the function/line that emitted it
formatter = logging.Formatter('[%(levelname)s] %(funcName)s:%(lineno)d %(message)s')
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
# Records are already emitted by console_handler: do not also hand them to
# the handlers of ancestor loggers
logger.propagate = False
def resolve_objects_names(pdf, objects_ids):
    """Map PDF object numbers to the 'Name' values of the objects they refer to.

    Each ID is looked up with ``pdf.findindirect(objnum, 0)``, i.e. the
    generation number is fixed at 0.  An ID that cannot be turned into a name
    is reported with a warning and skipped; it never aborts the run.

    Args:
        pdf: object exposing ``findindirect(objnum, gennum)``; the script
            passes a pdfrw ``PdfReader``.
        objects_ids: iterable of integer object numbers.

    Returns:
        A list holding the ``Name`` attribute of every object that could be
        resolved, in the order the IDs were given.
    """
    logger.debug("objects_ids %s", objects_ids)

    objects_names = []
    for objnum in objects_ids:
        indirect_object = pdf.findindirect(objnum, 0)
        if not isinstance(indirect_object, PdfIndirect):
            logger.warning("Object %d is not a PdfIndirect but a %s",
                           objnum, type(indirect_object))
            continue

        # The resolved value may be of a type that has no 'Name' attribute
        # at all; that is reported separately from an empty name.
        try:
            name = indirect_object.real_value().Name
        except AttributeError:
            logger.warning("Object %d has no 'Name' attribute", objnum)
            continue

        if name:
            objects_names.append(name)
        else:
            logger.warning("Object %d has an empty 'Name' attribute", objnum)

    logger.debug("objects_names %s\n", objects_names)

    return objects_names
def strip_objects(pdf, objects_names):
    """Remove the named XObject entries from every page of *pdf*.

    Pages are modified in place, and the same *pdf* object is returned so the
    result can be passed straight to a writer.

    Args:
        pdf: object exposing a ``pages`` sequence whose items carry a
            ``Resources.XObject`` mapping; the script passes a pdfrw
            ``PdfReader``.
        objects_names: iterable of XObject keys to delete.  A name that a
            page does not reference is ignored for that page.

    Returns:
        The *pdf* argument, after stripping.
    """
    for i, page in enumerate(pdf.pages):
        logger.debug("Page %d", i + 1)

        # A page is not required to reference any XObject, nor to carry its
        # own Resources entry.  Either lookup can yield None, and in that
        # case there is nothing to strip.
        resources = getattr(page, 'Resources', None)
        xobjects = getattr(resources, 'XObject', None)
        if xobjects is None:
            logger.debug("No XObject resources\n")
            continue

        logger.debug("Before %s", xobjects.keys())
        for obj in objects_names:
            if obj in xobjects:
                del xobjects[obj]

        logger.debug("After %s\n", xobjects.keys())

    return pdf
def validate_objects_ids(objects_ids_string):
    """Parse a comma-separated list of object IDs into a list of integers.

    Meant to be used as an argparse ``type=`` callback.

    Args:
        objects_ids_string: the raw command-line value, e.g. ``"12,15,20"``.

    Returns:
        The IDs as a list of ``int``, in the order given.

    Raises:
        argparse.ArgumentTypeError: if any item is not a valid integer
            (this includes empty items, as in ``"1,,2"`` or ``""``).
    """
    try:
        # Neither str.split() nor int() can raise IndexError, so ValueError
        # is the only failure to translate.
        objects_ids = [int(obj) for obj in objects_ids_string.split(',')]
    except ValueError:
        raise argparse.ArgumentTypeError("%s contains an invalid value" % objects_ids_string)

    return objects_ids
def main():
    """Command-line entry point: strip the requested objects from a PDF.

    Parses the command line, reads the input file, removes from every page
    the XObjects whose object IDs were given, and writes the result to the
    output file.

    Returns:
        0 on success, suitable as a process exit status.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("input_filename",
                        help="the input PDF file (better if uncompressed)")
    parser.add_argument("output_filename",
                        help="the output PDF file")
    parser.add_argument("objects_ids",
                        type=validate_objects_ids,
                        help="a comma-separated list of objects IDs")
    parser.add_argument("-d", "--debug", action="store_true",
                        help="enable debug output")
    args = parser.parse_args()

    # Debug output is opt-in: only raise the verbosity when asked to
    if args.debug:
        logger.setLevel(logging.DEBUG)

    pdf_data = PdfReader(args.input_filename)

    objects_names = resolve_objects_names(pdf_data, args.objects_ids)
    pdf_data = strip_objects(pdf_data, objects_names)

    PdfWriter().write(args.output_filename, pdf_data)

    return 0
108 if __name__ == '__main__':