from __future__ import annotations

import calendar
import codecs
import collections
import mmap
import os
import re
import time
import zlib
from typing import IO, TYPE_CHECKING, Any, NamedTuple, Union


# see 7.9.2.2 Text String Type on page 86 and D.3 PDFDocEncoding Character Set
# on page 656
def encode_text(s: str) -> bytes:
    return codecs.BOM_UTF16_BE + s.encode("utf_16_be")


PDFDocEncoding = {
    0x16: "\u0017",
    0x18: "\u02D8",
    0x19: "\u02C7",
    0x1A: "\u02C6",
    0x1B: "\u02D9",
    0x1C: "\u02DD",
    0x1D: "\u02DB",
    0x1E: "\u02DA",
    0x1F: "\u02DC",
    0x80: "\u2022",
    0x81: "\u2020",
    0x82: "\u2021",
    0x83: "\u2026",
    0x84: "\u2014",
    0x85: "\u2013",
    0x86: "\u0192",
    0x87: "\u2044",
    0x88: "\u2039",
    0x89: "\u203A",
    0x8A: "\u2212",
    0x8B: "\u2030",
    0x8C: "\u201E",
    0x8D: "\u201C",
    0x8E: "\u201D",
    0x8F: "\u2018",
    0x90: "\u2019",
    0x91: "\u201A",
    0x92: "\u2122",
    0x93: "\uFB01",
    0x94: "\uFB02",
    0x95: "\u0141",
    0x96: "\u0152",
    0x97: "\u0160",
    0x98: "\u0178",
    0x99: "\u017D",
    0x9A: "\u0131",
    0x9B: "\u0142",
    0x9C: "\u0153",
    0x9D: "\u0161",
    0x9E: "\u017E",
    0xA0: "\u20AC",
}


def decode_text(b: bytes) -> str:
    if b[: len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
        return b[len(codecs.BOM_UTF16_BE) :].decode("utf_16_be")
    else:
        return "".join(PDFDocEncoding.get(byte, chr(byte)) for byte in b)


class PdfFormatError(RuntimeError):
    """An error that probably indicates a syntactic or semantic error in the
    PDF file structure"""

    pass


def check_format_condition(condition: bool, error_message: str) -> None:
    if not condition:
        raise PdfFormatError(error_message)


class IndirectReferenceTuple(NamedTuple):
    object_id: int
    generation: int


class IndirectReference(IndirectReferenceTuple):
    def __str__(self) -> str:
        return f"{self.object_id} {self.generation} R"

    def __bytes__(self) -> bytes:
        return self.__str__().encode("us-ascii")

    def __eq__(self, other: object) -> bool:
        if self.__class__ is not other.__class__:
            return False
        assert isinstance(other, IndirectReference)
        return other.object_id == self.object_id and other.generation == self.generation

    def __ne__(self, other: object) -> bool:
        return not (self == other)

    def __hash__(self) -> int:
        return hash((self.object_id, self.generation))


class IndirectObjectDef(IndirectReference):
    def __str__(self) -> str:
        return f"{self.object_id} {self.generation} obj"


class XrefTable:
    def __init__(self) -> None:
        # object ID => (offset, generation)
        self.existing_entries: dict[int, tuple[int, int]] = {}
        # object ID => (offset, generation)
        self.new_entries: dict[int, tuple[int, int]] = {}
        self.deleted_entries = {0: 65536}  # object ID => generation
        self.reading_finished = False

    def __setitem__(self, key: int, value: tuple[int, int]) -> None:
        if self.reading_finished:
            self.new_entries[key] = value
        else:
            self.existing_entries[key] = value
        if key in self.deleted_entries:
            del self.deleted_entries[key]

    def __getitem__(self, key: int) -> tuple[int, int]:
        try:
            return self.new_entries[key]
        except KeyError:
            return self.existing_entries[key]

    def __delitem__(self, key: int) -> None:
        if key in self.new_entries:
            generation = self.new_entries[key][1] + 1
            del self.new_entries[key]
            self.deleted_entries[key] = generation
        elif key in self.existing_entries:
            generation = self.existing_entries[key][1] + 1
            self.deleted_entries[key] = generation
        elif key in self.deleted_entries:
            generation = self.deleted_entries[key]
        else:
            msg = f"object ID {key} cannot be deleted because it doesn't exist"
            raise IndexError(msg)

    def __contains__(self, key: int) -> bool:
        return key in self.existing_entries or key in self.new_entries

    def __len__(self) -> int:
        return len(
            set(self.existing_entries.keys())
            | set(self.new_entries.keys())
            | set(self.deleted_entries.keys())
        )

    def keys(self) -> set[int]:
        return (
            set(self.existing_entries.keys()) - set(self.deleted_entries.keys())
        ) | set(self.new_entries.keys())

    def write(self, f: IO[bytes]) -> int:
        keys = sorted(set(self.new_entries.keys()) | set(self.deleted_entries.keys()))
        deleted_keys = sorted(set(self.deleted_entries.keys()))
        startxref = f.tell()
        f.write(b"xref\n")
        while keys:
            # find a contiguous sequence of object IDs
            prev: int | None = None
            for index, key in enumerate(keys):
                if prev is None or prev + 1 == key:
                    prev = key
                else:
                    contiguous_keys = keys[:index]
                    keys = keys[index:]
                    break
            else:
                contiguous_keys = keys
                keys = []
            f.write(b"%d %d\n" % (contiguous_keys[0], len(contiguous_keys)))
            for object_id in contiguous_keys:
                if object_id in self.new_entries:
                    f.write(b"%010d %05d n \n" % self.new_entries[object_id])
                else:
                    this_deleted_object_id = deleted_keys.pop(0)
                    check_format_condition(
                        object_id == this_deleted_object_id,
                        f"expected the next deleted object ID to be {object_id}, "
                        f"instead found {this_deleted_object_id}",
                    )
                    try:
                        next_in_linked_list = deleted_keys[0]
                    except IndexError:
                        next_in_linked_list = 0
                    f.write(
                        b"%010d %05d f \n"
                        % (next_in_linked_list, self.deleted_entries[object_id])
                    )
        return startxref
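

# Illustrative sketch of how XrefTable is used while writing (the object ID and
# byte offset below are hypothetical):
#
#     table = XrefTable()
#     table.reading_finished = True  # route subsequent entries to new_entries
#     table[3] = (1234, 0)           # object 3 written at byte offset 1234
#     del table[3]                   # frees object 3, bumping its generation to 1
#     startxref = table.write(f)     # emits the xref section, returns its offset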


class PdfName:
    name: bytes

    def __init__(self, name: PdfName | bytes | str) -> None:
        if isinstance(name, PdfName):
            self.name = name.name
        elif isinstance(name, bytes):
            self.name = name
        else:
            self.name = name.encode("us-ascii")

    def name_as_str(self) -> str:
        return self.name.decode("us-ascii")

    def __eq__(self, other: object) -> bool:
        return (
            isinstance(other, PdfName) and other.name == self.name
        ) or other == self.name

    def __hash__(self) -> int:
        return hash(self.name)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({repr(self.name)})"

    @classmethod
    def from_pdf_stream(cls, data: bytes) -> PdfName:
        return cls(PdfParser.interpret_name(data))

    allowed_chars = set(range(33, 127)) - {ord(c) for c in "#%/()<>[]{}"}

    def __bytes__(self) -> bytes:
        result = bytearray(b"/")
        for b in self.name:
            if b in self.allowed_chars:
                result.append(b)
            else:
                result.extend(b"#%02X" % b)
        return bytes(result)
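

# Illustrative examples: PdfName serializes with a leading slash and escapes
# bytes outside the printable, non-delimiter range as #XX hex:
#
#     bytes(PdfName(b"Kids"))    == b"/Kids"
#     bytes(PdfName(b"Name 1"))  == b"/Name#201"  # space escaped as #20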


class PdfArray(list[Any]):
    def __bytes__(self) -> bytes:
        return b"[ " + b" ".join(pdf_repr(x) for x in self) + b" ]"


if TYPE_CHECKING:
    _DictBase = collections.UserDict[Union[str, bytes], Any]
else:
    _DictBase = collections.UserDict


class PdfDict(_DictBase):
    def __setattr__(self, key: str, value: Any) -> None:
        if key == "data":
            collections.UserDict.__setattr__(self, key, value)
        else:
            self[key.encode("us-ascii")] = value

    def __getattr__(self, key: str) -> str | time.struct_time:
        try:
            value = self[key.encode("us-ascii")]
        except KeyError as e:
            raise AttributeError(key) from e
        if isinstance(value, bytes):
            value = decode_text(value)
        if key.endswith("Date"):
            if value.startswith("D:"):
                value = value[2:]

            relationship = "Z"
            if len(value) > 17:
                relationship = value[14]
                offset = int(value[15:17]) * 60
                if len(value) > 20:
                    offset += int(value[18:20])

            format = "%Y%m%d%H%M%S"[: len(value) - 2]
            value = time.strptime(value[: len(format) + 2], format)
            if relationship in ["+", "-"]:
                offset *= 60
                if relationship == "+":
                    offset *= -1
                value = time.gmtime(calendar.timegm(value) + offset)
        return value

    def __bytes__(self) -> bytes:
        out = bytearray(b"<<")
        for key, value in self.items():
            if value is None:
                continue
            value = pdf_repr(value)
            out.extend(b"\n")
            out.extend(bytes(PdfName(key)))
            out.extend(b" ")
            out.extend(value)
        out.extend(b"\n>>")
        return bytes(out)


class PdfBinary:
    def __init__(self, data: list[int] | bytes) -> None:
        self.data = data

    def __bytes__(self) -> bytes:
        return b"<%s>" % b"".join(b"%02X" % b for b in self.data)


class PdfStream:
    def __init__(self, dictionary: PdfDict, buf: bytes) -> None:
        self.dictionary = dictionary
        self.buf = buf

    def decode(self) -> bytes:
        try:
            filter = self.dictionary[b"Filter"]
        except KeyError:
            return self.buf
        if filter == b"FlateDecode":
            try:
                expected_length = self.dictionary[b"DL"]
            except KeyError:
                expected_length = self.dictionary[b"Length"]
            return zlib.decompress(self.buf, bufsize=int(expected_length))
        else:
            msg = f"stream filter {repr(filter)} unknown/unsupported"
            raise NotImplementedError(msg)


def pdf_repr(x: Any) -> bytes:
    if x is True:
        return b"true"
    elif x is False:
        return b"false"
    elif x is None:
        return b"null"
    elif isinstance(x, (PdfName, PdfDict, PdfArray, PdfBinary)):
        return bytes(x)
    elif isinstance(x, (int, float)):
        return str(x).encode("us-ascii")
    elif isinstance(x, time.struct_time):
        return b"(D:" + time.strftime("%Y%m%d%H%M%SZ", x).encode("us-ascii") + b")"
    elif isinstance(x, dict):
        return bytes(PdfDict(x))
    elif isinstance(x, list):
        return bytes(PdfArray(x))
    elif isinstance(x, str):
        return pdf_repr(encode_text(x))
    elif isinstance(x, bytes):
        # XXX escape more chars? handle binary garbage
        x = x.replace(b"\\", b"\\\\")
        x = x.replace(b"(", b"\\(")
        x = x.replace(b")", b"\\)")
        return b"(" + x + b")"
    else:
        return bytes(x)
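

# Illustrative examples (not part of the original module) of pdf_repr() output,
# following the branches above:
#
#     pdf_repr(None)                        == b"null"
#     pdf_repr(3.5)                         == b"3.5"
#     pdf_repr([1, 2, PdfName(b"Foo")])     == b"[ 1 2 /Foo ]"
#     pdf_repr({"Type": PdfName(b"Page")})  == b"<<\n/Type /Page\n>>"
#     pdf_repr(b"a(b)")                     == b"(a\\(b\\))"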


class PdfParser:
    """Based on
    https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf
    Supports PDF up to 1.4
    """

    def __init__(
        self,
        filename: str | None = None,
        f: IO[bytes] | None = None,
        buf: bytes | bytearray | None = None,
        start_offset: int = 0,
        mode: str = "rb",
    ) -> None:
        if buf and f:
            msg = "specify buf or f or filename, but not both buf and f"
            raise RuntimeError(msg)
        self.filename = filename
        self.buf: bytes | bytearray | mmap.mmap | None = buf
        self.f = f
        self.start_offset = start_offset
        self.should_close_buf = False
        self.should_close_file = False
        if filename is not None and f is None:
            self.f = f = open(filename, mode)
            self.should_close_file = True
        if f is not None:
            self.buf = self.get_buf_from_file(f)
            self.should_close_buf = True
            if not filename and hasattr(f, "name"):
                self.filename = f.name
        self.cached_objects: dict[IndirectReference, Any] = {}
        self.root_ref: IndirectReference | None
        self.info_ref: IndirectReference | None
        self.pages_ref: IndirectReference | None
        self.last_xref_section_offset: int | None
        if self.buf:
            self.read_pdf_info()
        else:
            self.file_size_total = self.file_size_this = 0
            self.root = PdfDict()
            self.root_ref = None
            self.info = PdfDict()
            self.info_ref = None
            self.page_tree_root = PdfDict()
            self.pages: list[IndirectReference] = []
            self.orig_pages: list[IndirectReference] = []
            self.pages_ref = None
            self.last_xref_section_offset = None
            self.trailer_dict: dict[bytes, Any] = {}
            self.xref_table = XrefTable()
        self.xref_table.reading_finished = True
        if f:
            self.seek_end()

    def __enter__(self) -> PdfParser:
        return self

    def __exit__(self, *args: object) -> None:
        self.close()

    def start_writing(self) -> None:
        self.close_buf()
        self.seek_end()

    def close_buf(self) -> None:
        if isinstance(self.buf, mmap.mmap):
            self.buf.close()
        self.buf = None

    def close(self) -> None:
        if self.should_close_buf:
            self.close_buf()
        if self.f is not None and self.should_close_file:
            self.f.close()
            self.f = None

    def seek_end(self) -> None:
        assert self.f is not None
        self.f.seek(0, os.SEEK_END)

    def write_header(self) -> None:
        assert self.f is not None
        self.f.write(b"%PDF-1.4\n")

    def write_comment(self, s: str) -> None:
        assert self.f is not None
        self.f.write(f"% {s}\n".encode())

    def write_catalog(self) -> IndirectReference:
        assert self.f is not None
        self.del_root()
        self.root_ref = self.next_object_id(self.f.tell())
        self.pages_ref = self.next_object_id(0)
        self.rewrite_pages()
        self.write_obj(self.root_ref, Type=PdfName(b"Catalog"), Pages=self.pages_ref)
        self.write_obj(
            self.pages_ref,
            Type=PdfName(b"Pages"),
            Count=len(self.pages),
            Kids=self.pages,
        )
        return self.root_ref

    def rewrite_pages(self) -> None:
        pages_tree_nodes_to_delete = []
        for i, page_ref in enumerate(self.orig_pages):
            page_info = self.cached_objects[page_ref]
            del self.xref_table[page_ref.object_id]
            pages_tree_nodes_to_delete.append(page_info[PdfName(b"Parent")])
            if page_ref not in self.pages:
                # the page has been deleted
                continue
            # make dict keys into strings for passing to write_page
            stringified_page_info = {}
            for key, value in page_info.items():
                # key should be a PdfName
                stringified_page_info[key.name_as_str()] = value
            stringified_page_info["Parent"] = self.pages_ref
            new_page_ref = self.write_page(None, **stringified_page_info)
            for j, cur_page_ref in enumerate(self.pages):
                if cur_page_ref == page_ref:
                    # replace the page reference with the new one
                    self.pages[j] = new_page_ref
        # delete redundant Pages tree nodes from xref table
        for pages_tree_node_ref in pages_tree_nodes_to_delete:
            while pages_tree_node_ref:
                pages_tree_node = self.cached_objects[pages_tree_node_ref]
                if pages_tree_node_ref.object_id in self.xref_table:
                    del self.xref_table[pages_tree_node_ref.object_id]
                pages_tree_node_ref = pages_tree_node.get(b"Parent", None)
        self.orig_pages = []

    def write_xref_and_trailer(
        self, new_root_ref: IndirectReference | None = None
    ) -> None:
        assert self.f is not None
        if new_root_ref:
            self.del_root()
            self.root_ref = new_root_ref
        if self.info:
            self.info_ref = self.write_obj(None, self.info)
        start_xref = self.xref_table.write(self.f)
        num_entries = len(self.xref_table)
        trailer_dict: dict[str | bytes, Any] = {
            b"Root": self.root_ref,
            b"Size": num_entries,
        }
        if self.last_xref_section_offset is not None:
            trailer_dict[b"Prev"] = self.last_xref_section_offset
        if self.info:
            trailer_dict[b"Info"] = self.info_ref
        self.last_xref_section_offset = start_xref
        self.f.write(
            b"trailer\n"
            + bytes(PdfDict(trailer_dict))
            + b"\nstartxref\n%d\n%%%%EOF" % start_xref
        )

    def write_page(
        self, ref: int | IndirectReference | None, *objs: Any, **dict_obj: Any
    ) -> IndirectReference:
        obj_ref = self.pages[ref] if isinstance(ref, int) else ref
        if "Type" not in dict_obj:
            dict_obj["Type"] = PdfName(b"Page")
        if "Parent" not in dict_obj:
            dict_obj["Parent"] = self.pages_ref
        return self.write_obj(obj_ref, *objs, **dict_obj)

    def write_obj(
        self, ref: IndirectReference | None, *objs: Any, **dict_obj: Any
    ) -> IndirectReference:
        assert self.f is not None
        f = self.f
        if ref is None:
            ref = self.next_object_id(f.tell())
        else:
            self.xref_table[ref.object_id] = (f.tell(), ref.generation)
        f.write(bytes(IndirectObjectDef(*ref)))
        stream = dict_obj.pop("stream", None)
        if stream is not None:
            dict_obj["Length"] = len(stream)
        if dict_obj:
            f.write(pdf_repr(dict_obj))
        for obj in objs:
            f.write(pdf_repr(obj))
        if stream is not None:
            f.write(b"stream\n")
            f.write(stream)
            f.write(b"\nendstream\n")
        f.write(b"endobj\n")
        return ref

    def del_root(self) -> None:
        if self.root_ref is None:
            return
        del self.xref_table[self.root_ref.object_id]
        del self.xref_table[self.root[b"Pages"].object_id]

    @staticmethod
    def get_buf_from_file(f: IO[bytes]) -> bytes | mmap.mmap:
        if hasattr(f, "getbuffer"):
            return f.getbuffer()
        elif hasattr(f, "getvalue"):
            return f.getvalue()
        else:
            try:
                return mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            except ValueError:  # cannot mmap an empty file
                return b""

    def read_pdf_info(self) -> None:
        assert self.buf is not None
        self.file_size_total = len(self.buf)
        self.file_size_this = self.file_size_total - self.start_offset
        self.read_trailer()
        check_format_condition(
            self.trailer_dict.get(b"Root") is not None, "Root is missing"
        )
        self.root_ref = self.trailer_dict[b"Root"]
        assert self.root_ref is not None
        self.info_ref = self.trailer_dict.get(b"Info", None)
        self.root = PdfDict(self.read_indirect(self.root_ref))
        if self.info_ref is None:
            self.info = PdfDict()
        else:
            self.info = PdfDict(self.read_indirect(self.info_ref))
        check_format_condition(b"Type" in self.root, "/Type missing in Root")
        check_format_condition(
            self.root[b"Type"] == b"Catalog", "/Type in Root is not /Catalog"
        )
        check_format_condition(
            self.root.get(b"Pages") is not None, "/Pages missing in Root"
        )
        check_format_condition(
            isinstance(self.root[b"Pages"], IndirectReference),
            "/Pages in Root is not an indirect reference",
        )
        self.pages_ref = self.root[b"Pages"]
        assert self.pages_ref is not None
        self.page_tree_root = self.read_indirect(self.pages_ref)
        self.pages = self.linearize_page_tree(self.page_tree_root)
        # save the original list of page references
        # in case the user modifies, adds or deletes some pages
        # and we need to rewrite the pages and their list
        self.orig_pages = self.pages[:]

    def next_object_id(self, offset: int | None = None) -> IndirectReference:
        try:
            # TODO: support reuse of deleted objects
            reference = IndirectReference(max(self.xref_table.keys()) + 1, 0)
        except ValueError:
            reference = IndirectReference(1, 0)
        if offset is not None:
            self.xref_table[reference.object_id] = (offset, 0)
        return reference

    delimiter = rb"[][()<>{}/%]"
    delimiter_or_ws = rb"[][()<>{}/%\000\011\012\014\015\040]"
    whitespace = rb"[\000\011\012\014\015\040]"
    whitespace_or_hex = rb"[\000\011\012\014\015\0400-9a-fA-F]"
    whitespace_optional = whitespace + b"*"
    whitespace_mandatory = whitespace + b"+"
    # No "\012" aka "\n" or "\015" aka "\r":
    whitespace_optional_no_nl = rb"[\000\011\014\040]*"
    newline_only = rb"[\r\n]+"
    newline = whitespace_optional_no_nl + newline_only + whitespace_optional_no_nl
    re_trailer_end = re.compile(
        whitespace_mandatory
        + rb"trailer"
        + whitespace_optional
        + rb"<<(.*>>)"
        + newline
        + rb"startxref"
        + newline
        + rb"([0-9]+)"
        + newline
        + rb"%%EOF"
        + whitespace_optional
        + rb"$",
        re.DOTALL,
    )
    re_trailer_prev = re.compile(
        whitespace_optional
        + rb"trailer"
        + whitespace_optional
        + rb"<<(.*?>>)"
        + newline
        + rb"startxref"
        + newline
        + rb"([0-9]+)"
        + newline
        + rb"%%EOF"
        + whitespace_optional,
        re.DOTALL,
    )

    def read_trailer(self) -> None:
        assert self.buf is not None
        search_start_offset = len(self.buf) - 16384
        if search_start_offset < self.start_offset:
            search_start_offset = self.start_offset
        m = self.re_trailer_end.search(self.buf, search_start_offset)
        check_format_condition(m is not None, "trailer end not found")
        # make sure we found the LAST trailer
        last_match = m
        while m:
            last_match = m
            m = self.re_trailer_end.search(self.buf, m.start() + 16)
        if not m:
            m = last_match
        assert m is not None
        trailer_data = m.group(1)
        self.last_xref_section_offset = int(m.group(2))
        self.trailer_dict = self.interpret_trailer(trailer_data)
        self.xref_table = XrefTable()
        self.read_xref_table(xref_section_offset=self.last_xref_section_offset)
        if b"Prev" in self.trailer_dict:
            self.read_prev_trailer(self.trailer_dict[b"Prev"])

    def read_prev_trailer(self, xref_section_offset: int) -> None:
        assert self.buf is not None
        trailer_offset = self.read_xref_table(xref_section_offset=xref_section_offset)
        m = self.re_trailer_prev.search(
            self.buf[trailer_offset : trailer_offset + 16384]
        )
        check_format_condition(m is not None, "previous trailer not found")
        assert m is not None
        trailer_data = m.group(1)
        check_format_condition(
            int(m.group(2)) == xref_section_offset,
            "xref section offset in previous trailer doesn't match what was expected",
        )
        trailer_dict = self.interpret_trailer(trailer_data)
        if b"Prev" in trailer_dict:
            self.read_prev_trailer(trailer_dict[b"Prev"])

    re_whitespace_optional = re.compile(whitespace_optional)
    re_name = re.compile(
        whitespace_optional
        + rb"/([!-$&'*-.0-;=?-Z\\^-z|~]+)(?="
        + delimiter_or_ws
        + rb")"
    )
    re_dict_start = re.compile(whitespace_optional + rb"<<")
    re_dict_end = re.compile(whitespace_optional + rb">>" + whitespace_optional)

    @classmethod
    def interpret_trailer(cls, trailer_data: bytes) -> dict[bytes, Any]:
        trailer = {}
        offset = 0
        while True:
            m = cls.re_name.match(trailer_data, offset)
            if not m:
                m = cls.re_dict_end.match(trailer_data, offset)
                check_format_condition(
                    m is not None and m.end() == len(trailer_data),
                    "name not found in trailer, remaining data: "
                    + repr(trailer_data[offset:]),
                )
                break
            key = cls.interpret_name(m.group(1))
            assert isinstance(key, bytes)
            value, value_offset = cls.get_value(trailer_data, m.end())
            trailer[key] = value
            if value_offset is None:
                break
            offset = value_offset
        check_format_condition(
            b"Size" in trailer and isinstance(trailer[b"Size"], int),
            "/Size not in trailer or not an integer",
        )
        check_format_condition(
            b"Root" in trailer and isinstance(trailer[b"Root"], IndirectReference),
            "/Root not in trailer or not an indirect reference",
        )
        return trailer

    re_hashes_in_name = re.compile(rb"([^#]*)(#([0-9a-fA-F]{2}))?")

    @classmethod
    def interpret_name(cls, raw: bytes, as_text: bool = False) -> str | bytes:
        name = b""
        for m in cls.re_hashes_in_name.finditer(raw):
            if m.group(3):
                name += m.group(1) + bytearray.fromhex(m.group(3).decode("us-ascii"))
            else:
                name += m.group(1)
        if as_text:
            return name.decode("utf-8")
        else:
            return bytes(name)

    re_null = re.compile(whitespace_optional + rb"null(?=" + delimiter_or_ws + rb")")
    re_true = re.compile(whitespace_optional + rb"true(?=" + delimiter_or_ws + rb")")
    re_false = re.compile(whitespace_optional + rb"false(?=" + delimiter_or_ws + rb")")
    re_int = re.compile(
        whitespace_optional + rb"([-+]?[0-9]+)(?=" + delimiter_or_ws + rb")"
    )
    re_real = re.compile(
        whitespace_optional
        + rb"([-+]?([0-9]+\.[0-9]*|[0-9]*\.[0-9]+))(?="
        + delimiter_or_ws
        + rb")"
    )
    re_array_start = re.compile(whitespace_optional + rb"\[")
    re_array_end = re.compile(whitespace_optional + rb"]")
    re_string_hex = re.compile(
        whitespace_optional + rb"<(" + whitespace_or_hex + rb"*)>"
    )
    re_string_lit = re.compile(whitespace_optional + rb"\(")
    re_indirect_reference = re.compile(
        whitespace_optional
        + rb"([-+]?[0-9]+)"
        + whitespace_mandatory
        + rb"([-+]?[0-9]+)"
        + whitespace_mandatory
        + rb"R(?="
        + delimiter_or_ws
        + rb")"
    )
    re_indirect_def_start = re.compile(
        whitespace_optional
        + rb"([-+]?[0-9]+)"
        + whitespace_mandatory
        + rb"([-+]?[0-9]+)"
        + whitespace_mandatory
        + rb"obj(?="
        + delimiter_or_ws
        + rb")"
    )
    re_indirect_def_end = re.compile(
        whitespace_optional + rb"endobj(?=" + delimiter_or_ws + rb")"
    )
    re_comment = re.compile(
        rb"(" + whitespace_optional + rb"%[^\r\n]*" + newline + rb")*"
    )
    re_stream_start = re.compile(whitespace_optional + rb"stream\r?\n")
    re_stream_end = re.compile(
        whitespace_optional + rb"endstream(?=" + delimiter_or_ws + rb")"
    )

    @classmethod
    def get_value(
        cls,
        data: bytes | bytearray | mmap.mmap,
        offset: int,
        expect_indirect: IndirectReference | None = None,
        max_nesting: int = -1,
    ) -> tuple[Any, int | None]:
        if max_nesting == 0:
            return None, None
        m = cls.re_comment.match(data, offset)
        if m:
            offset = m.end()
        m = cls.re_indirect_def_start.match(data, offset)
        if m:
            check_format_condition(
                int(m.group(1)) > 0,
                "indirect object definition: object ID must be greater than 0",
            )
            check_format_condition(
                int(m.group(2)) >= 0,
                "indirect object definition: generation must be non-negative",
            )
            check_format_condition(
                expect_indirect is None
                or expect_indirect
                == IndirectReference(int(m.group(1)), int(m.group(2))),
                "indirect object definition different than expected",
            )
            object, object_offset = cls.get_value(
                data, m.end(), max_nesting=max_nesting - 1
            )
            if object_offset is None:
                return object, None
            m = cls.re_indirect_def_end.match(data, object_offset)
            check_format_condition(
                m is not None, "indirect object definition end not found"
            )
            assert m is not None
            return object, m.end()
        check_format_condition(
            not expect_indirect, "indirect object definition not found"
        )
        m = cls.re_indirect_reference.match(data, offset)
        if m:
            check_format_condition(
                int(m.group(1)) > 0,
                "indirect object reference: object ID must be greater than 0",
            )
            check_format_condition(
                int(m.group(2)) >= 0,
                "indirect object reference: generation must be non-negative",
            )
            return IndirectReference(int(m.group(1)), int(m.group(2))), m.end()
        m = cls.re_dict_start.match(data, offset)
        if m:
            offset = m.end()
            result: dict[Any, Any] = {}
            m = cls.re_dict_end.match(data, offset)
            current_offset: int | None = offset
            while not m:
                assert current_offset is not None
                key, current_offset = cls.get_value(
                    data, current_offset, max_nesting=max_nesting - 1
                )
                if current_offset is None:
                    return result, None
                value, current_offset = cls.get_value(
                    data, current_offset, max_nesting=max_nesting - 1
                )
                result[key] = value
                if current_offset is None:
                    return result, None
                m = cls.re_dict_end.match(data, current_offset)
            current_offset = m.end()
            m = cls.re_stream_start.match(data, current_offset)
            if m:
                stream_len = result.get(b"Length")
                if stream_len is None or not isinstance(stream_len, int):
                    msg = f"bad or missing Length in stream dict ({stream_len})"
                    raise PdfFormatError(msg)
                stream_data = data[m.end() : m.end() + stream_len]
                m = cls.re_stream_end.match(data, m.end() + stream_len)
                check_format_condition(m is not None, "stream end not found")
                assert m is not None
                current_offset = m.end()
                return PdfStream(PdfDict(result), stream_data), current_offset
            return PdfDict(result), current_offset
        m = cls.re_array_start.match(data, offset)
        if m:
            offset = m.end()
            results = []
            m = cls.re_array_end.match(data, offset)
            current_offset = offset
            while not m:
                assert current_offset is not None
                value, current_offset = cls.get_value(
                    data, current_offset, max_nesting=max_nesting - 1
                )
                results.append(value)
                if current_offset is None:
                    return results, None
                m = cls.re_array_end.match(data, current_offset)
            return results, m.end()
        m = cls.re_null.match(data, offset)
        if m:
            return None, m.end()
        m = cls.re_true.match(data, offset)
        if m:
            return True, m.end()
        m = cls.re_false.match(data, offset)
        if m:
            return False, m.end()
        m = cls.re_name.match(data, offset)
        if m:
            return PdfName(cls.interpret_name(m.group(1))), m.end()
        m = cls.re_int.match(data, offset)
        if m:
            return int(m.group(1)), m.end()
        m = cls.re_real.match(data, offset)
        if m:
            # XXX Decimal instead of float???
            return float(m.group(1)), m.end()
        m = cls.re_string_hex.match(data, offset)
        if m:
            # filter out whitespace
            hex_string = bytearray(
                b for b in m.group(1) if b in b"0123456789abcdefABCDEF"
            )
            if len(hex_string) % 2 == 1:
                # append a 0 if the length is not even - yes, at the end
                hex_string.append(ord(b"0"))
            return bytearray.fromhex(hex_string.decode("us-ascii")), m.end()
        m = cls.re_string_lit.match(data, offset)
        if m:
            return cls.get_literal_string(data, m.end())
        # return None, offset  # fallback (only for debugging)
        msg = f"unrecognized object: {repr(data[offset : offset + 32])}"
        raise PdfFormatError(msg)
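
    # Illustrative sketch of get_value() on small token streams (inputs are
    # hypothetical; the offsets follow from the regexes above):
    #
    #     PdfParser.get_value(b"<< /Type /Page >>", 0)
    #         -> (PdfDict mapping /Type to /Page, 17)
    #     PdfParser.get_value(b" 3 0 R ", 0)
    #         -> (IndirectReference(object_id=3, generation=0), 6)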

    re_lit_str_token = re.compile(
        rb"(\\[nrtbf()\\])|(\\[0-9]{1,3})|(\\(\r\n|\r|\n))|(\r\n|\r|\n)|(\()|(\))"
    )
    escaped_chars = {
        b"n": b"\n",
        b"r": b"\r",
        b"t": b"\t",
        b"b": b"\b",
        b"f": b"\f",
        b"(": b"(",
        b")": b")",
        b"\\": b"\\",
        ord(b"n"): b"\n",
        ord(b"r"): b"\r",
        ord(b"t"): b"\t",
        ord(b"b"): b"\b",
        ord(b"f"): b"\f",
        ord(b"("): b"(",
        ord(b")"): b")",
        ord(b"\\"): b"\\",
    }

    @classmethod
    def get_literal_string(
        cls, data: bytes | bytearray | mmap.mmap, offset: int
    ) -> tuple[bytes, int]:
        nesting_depth = 0
        result = bytearray()
        for m in cls.re_lit_str_token.finditer(data, offset):
            result.extend(data[offset : m.start()])
            if m.group(1):
                result.extend(cls.escaped_chars[m.group(1)[1]])
            elif m.group(2):
                result.append(int(m.group(2)[1:], 8))
            elif m.group(3):
                pass
            elif m.group(5):
                result.extend(b"\n")
            elif m.group(6):
                result.extend(b"(")
                nesting_depth += 1
            elif m.group(7):
                if nesting_depth == 0:
                    return bytes(result), m.end()
                result.extend(b")")
                nesting_depth -= 1
            offset = m.end()
        msg = "unfinished literal string"
        raise PdfFormatError(msg)

    re_xref_section_start = re.compile(whitespace_optional + rb"xref" + newline)
    re_xref_subsection_start = re.compile(
        whitespace_optional
        + rb"([0-9]+)"
        + whitespace_mandatory
        + rb"([0-9]+)"
        + whitespace_optional
        + newline_only
    )
    re_xref_entry = re.compile(rb"([0-9]{10}) ([0-9]{5}) ([fn])( \r| \n|\r\n)")

    def read_xref_table(self, xref_section_offset: int) -> int:
        assert self.buf is not None
        subsection_found = False
        m = self.re_xref_section_start.match(
            self.buf, xref_section_offset + self.start_offset
        )
        check_format_condition(m is not None, "xref section start not found")
        assert m is not None
        offset = m.end()
        while True:
            m = self.re_xref_subsection_start.match(self.buf, offset)
            if not m:
                check_format_condition(
                    subsection_found, "xref subsection start not found"
                )
                break
            subsection_found = True
            offset = m.end()
            first_object = int(m.group(1))
            num_objects = int(m.group(2))
            for i in range(first_object, first_object + num_objects):
                m = self.re_xref_entry.match(self.buf, offset)
                check_format_condition(m is not None, "xref entry not found")
                assert m is not None
                offset = m.end()
                is_free = m.group(3) == b"f"
                if not is_free:
                    generation = int(m.group(2))
                    new_entry = (int(m.group(1)), generation)
                    if i not in self.xref_table:
                        self.xref_table[i] = new_entry
        return offset
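
    # For reference, the classic cross-reference section parsed above looks like
    # this (offsets hypothetical); each entry is exactly 20 bytes including the
    # two-character line ending:
    #
    #     xref
    #     0 3
    #     0000000000 65535 f
    #     0000000017 00000 n
    #     0000000081 00000 n
    #
    # In-use ("n") entries are stored as (offset, generation) keyed by object ID;
    # free ("f") entries are skipped.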

    def read_indirect(self, ref: IndirectReference, max_nesting: int = -1) -> Any:
        offset, generation = self.xref_table[ref[0]]
        check_format_condition(
            generation == ref[1],
            f"expected to find generation {ref[1]} for object ID {ref[0]} in xref "
            f"table, instead found generation {generation} at offset {offset}",
        )
        assert self.buf is not None
        value = self.get_value(
            self.buf,
            offset + self.start_offset,
            expect_indirect=IndirectReference(*ref),
            max_nesting=max_nesting,
        )[0]
        self.cached_objects[ref] = value
        return value

    def linearize_page_tree(
        self, node: PdfDict | None = None
    ) -> list[IndirectReference]:
        page_node = node if node is not None else self.page_tree_root
        check_format_condition(
            page_node[b"Type"] == b"Pages", "/Type of page tree node is not /Pages"
        )
        pages = []
        for kid in page_node[b"Kids"]:
            kid_object = self.read_indirect(kid)
            if kid_object[b"Type"] == b"Page":
                pages.append(kid)
            else:
                pages.extend(self.linearize_page_tree(node=kid_object))
        return pages
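

# Minimal usage sketch (not part of the parser itself): open a PDF, report the
# page count, and dump the Info dictionary if present. The default path
# "example.pdf" is a placeholder; pass a real file on the command line.
if __name__ == "__main__":
    import sys

    path = sys.argv[1] if len(sys.argv) > 1 else "example.pdf"
    with PdfParser(filename=path) as parser:
        print(f"{path}: {len(parser.pages)} page(s)")
        if parser.info:
            print("Info:", dict(parser.info))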