Viewing file: PDFObject.php (47.34 KB) -rw-rw-rw- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
<?php
/** * @file * This file is part of the PdfParser library. * * @author Sébastien MALOT <sebastien@malot.fr> * * @date 2017-01-03 * * @license LGPLv3 * * @url <https://github.com/smalot/pdfparser> * * PdfParser is a pdf library written in PHP, extraction oriented. * Copyright (C) 2017 - Sébastien MALOT <sebastien@malot.fr> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. * If not, see <http://www.pdfparser.org/sites/default/LICENSE.txt>. */
namespace Smalot\PdfParser;
use Smalot\PdfParser\Exception\InvalidDictionaryObjectException; use Smalot\PdfParser\XObject\Form; use Smalot\PdfParser\XObject\Image;
/** * Class PDFObject */ class PDFObject { public const TYPE = 't';
public const OPERATOR = 'o';
public const COMMAND = 'c';
/** * The recursion stack. * * @var array */ public static $recursionStack = [];
/** * @var Document|null */ protected $document;
/** * @var Header */ protected $header;
/** * @var string */ protected $content;
/** * @var Config|null */ protected $config;
/** * @var bool */ protected $addPositionWhitespace = false;
public function __construct( Document $document, ?Header $header = null, ?string $content = null, ?Config $config = null ) { $this->document = $document; $this->header = $header ?? new Header(); $this->content = $content; $this->config = $config; }
public function init() { }
public function getDocument(): Document { return $this->document; }
public function getHeader(): ?Header { return $this->header; }
public function getConfig(): ?Config { return $this->config; }
/** * @return Element|PDFObject|Header */ public function get(string $name) { return $this->header->get($name); }
public function has(string $name): bool { return $this->header->has($name); }
public function getDetails(bool $deep = true): array { return $this->header->getDetails($deep); }
public function getContent(): ?string { return $this->content; }
/** * Creates a duplicate of the document stream with * strings and other items replaced by $char. Formerly * getSectionsText() used this output to more easily gather offset * values to extract text from the *actual* document stream. * * @deprecated function is no longer used and will be removed in a future release * * @internal */ public function cleanContent(string $content, string $char = 'X') { $char = $char[0]; $content = str_replace(['\\\\', '\\)', '\\('], $char.$char, $content);
// Remove image bloc with binary content preg_match_all('/\s(BI\s.*?(\sID\s).*?(\sEI))\s/s', $content, $matches, \PREG_OFFSET_CAPTURE); foreach ($matches[0] as $part) { $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); }
// Clean content in square brackets [.....] preg_match_all('/\[((\(.*?\)|[0-9\.\-\s]*)*)\]/s', $content, $matches, \PREG_OFFSET_CAPTURE); foreach ($matches[1] as $part) { $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); }
// Clean content in round brackets (.....) preg_match_all('/\((.*?)\)/s', $content, $matches, \PREG_OFFSET_CAPTURE); foreach ($matches[1] as $part) { $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); }
// Clean structure if ($parts = preg_split('/(<|>)/s', $content, -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE)) { $content = ''; $level = 0; foreach ($parts as $part) { if ('<' == $part) { ++$level; }
$content .= (0 == $level ? $part : str_repeat($char, \strlen($part)));
if ('>' == $part) { --$level; } } }
// Clean BDC and EMC markup preg_match_all( '/(\/[A-Za-z0-9\_]*\s*'.preg_quote($char).'*BDC)/s', $content, $matches, \PREG_OFFSET_CAPTURE ); foreach ($matches[1] as $part) { $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); }
preg_match_all('/\s(EMC)\s/s', $content, $matches, \PREG_OFFSET_CAPTURE); foreach ($matches[1] as $part) { $content = substr_replace($content, str_repeat($char, \strlen($part[0])), $part[1], \strlen($part[0])); }
return $content; }
/** * Takes a string of PDF document stream text and formats * it into a multi-line string with one PDF command on each line, * separated by \r\n. If the given string is null, or binary data * is detected instead of a document stream then return an empty * string. */ private function formatContent(?string $content): string { if (null === $content) { return ''; }
// Outside of (String) and inline image content in PDF document // streams, all text should conform to UTF-8. Test for binary // content by deleting everything after the first open- // parenthesis ( which indicates the beginning of a string, or // the first ID command which indicates the beginning of binary // inline image content. Then test what remains for valid // UTF-8. If it's not UTF-8, return an empty string as this // $content is most likely binary. Unfortunately, using // mb_check_encoding(..., 'UTF-8') is not strict enough, so the // following regexp, adapted from the W3, is used. See: // https://www.w3.org/International/questions/qa-forms-utf-8.en // We use preg_replace() instead of preg_match() to avoid "JIT // stack limit exhausted" errors on larger files. $utf8Filter = preg_replace('/( [\x09\x0A\x0D\x20-\x7E] | # ASCII [\xC2-\xDF][\x80-\xBF] | # non-overlong 2-byte \xE0[\xA0-\xBF][\x80-\xBF] | # excluding overlongs [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} | # straight 3-byte \xED[\x80-\x9F][\x80-\xBF] | # excluding surrogates \xF0[\x90-\xBF][\x80-\xBF]{2} | # planes 1-3 [\xF1-\xF3][\x80-\xBF]{3} | # planes 4-15 \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 )/xs', '', preg_replace('/(\(|ID\s).*$/s', '', $content));
if ('' !== $utf8Filter) { return ''; }
// Find all inline image content and replace them so they aren't // affected by the next steps $pdfInlineImages = []; $offsetBI = 0; while (preg_match('/\sBI\s(\/.+?)\sID\s(.+?)\sEI(?=\s|$)/s', $content, $text, \PREG_OFFSET_CAPTURE, $offsetBI)) { // Attempt to detemine if this instance of the 'BI' command // actually occured within a (string) using the following // steps:
// Step 1: Remove any escaped slashes and parentheses from // the alleged image characteristics data $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[1][0]);
// Step 2: Remove all correctly ordered and balanced // parentheses from (strings) do { $paraTest = $para; $para = preg_replace('/\(([^()]*)\)/', '$1', $paraTest); } while ($para != $paraTest);
$paraOpen = strpos($para, '('); $paraClose = strpos($para, ')');
// Check: If the remaining text contains a close parenthesis // ')' AND it occurs before any open parenthesis, then we // are almost certain to be inside a (string) if (0 < $paraClose && (false === $paraOpen || $paraClose < $paraOpen)) { // Bump the search offset forward and match again $offsetBI = (int) $text[1][1]; continue; }
// Step 3: Double check that this is actually inline image // data by parsing the alleged image characteristics as a // dictionary $dict = $this->parseDictionary('<<'.$text[1][0].'>>');
// Check if an image Width and Height are set in the dict if ((isset($dict['W']) || isset($dict['Width'])) && (isset($dict['H']) || isset($dict['Height']))) { $id = uniqid('IMAGE_', true); $pdfInlineImages[$id] = [ preg_replace(['/\r\n/', '/\r/', '/\n/'], ' ', $text[1][0]), preg_replace(['/\r\n/', '/\r/', '/\n/'], '', $text[2][0]), ]; $content = preg_replace( '/'.preg_quote($text[0][0], '/').'/', '^^^'.$id.'^^^', $content, 1 ); } else { // If there was no valid dictionary, or a height and width // weren't specified, then we don't know what this is, so // just leave it alone; bump the search offset forward and // match again $offsetBI = (int) $text[1][1]; } }
// Find all strings () and replace them so they aren't affected // by the next steps $pdfstrings = []; $attempt = '('; while (preg_match('/'.preg_quote($attempt, '/').'.*?\)/s', $content, $text)) { // Remove all escaped slashes and parentheses from the target text $para = str_replace(['\\\\', '\\(', '\\)'], '', $text[0]);
// PDF strings can contain unescaped parentheses as long as // they're balanced, so check for balanced parentheses $left = preg_match_all('/\(/', $para); $right = preg_match_all('/\)/', $para);
if (')' == $para[-1] && $left == $right) { // Replace the string with a unique placeholder $id = uniqid('STRING_', true); $pdfstrings[$id] = $text[0]; $content = preg_replace( '/'.preg_quote($text[0], '/').'/', '@@@'.$id.'@@@', $content, 1 );
// Reset to search for the next string $attempt = '('; } else { // We had unbalanced parentheses, so use the current // match as a base to find a longer string $attempt = $text[0]; } }
// Remove all carriage returns and line-feeds from the document stream $content = str_replace(["\r", "\n"], ' ', trim($content));
// Find all dictionary << >> commands and replace them so they // aren't affected by the next steps $dictstore = []; while (preg_match('/(<<.*?>> *)(BDC|BMC|DP|MP)/s', $content, $dicttext)) { $dictid = uniqid('DICT_', true); $dictstore[$dictid] = $dicttext[1]; $content = preg_replace( '/'.preg_quote($dicttext[0], '/').'/', ' ###'.$dictid.'###'.$dicttext[2], $content, 1 ); }
// Normalize white-space in the document stream $content = preg_replace('/\s{2,}/', ' ', $content);
// Find all valid PDF operators and add \r\n after each; this // ensures there is just one command on every line // Source: https://ia801001.us.archive.org/1/items/pdf1.7/pdf_reference_1-7.pdf - Appendix A // Source: https://archive.org/download/pdf320002008/PDF32000_2008.pdf - Annex A // Note: PDF Reference 1.7 lists 'I' and 'rI' as valid commands, while // PDF 32000:2008 lists them as 'i' and 'ri' respectively. Both versions // appear here in the list for completeness. $operators = [ 'b*', 'b', 'BDC', 'BMC', 'B*', 'BI', 'BT', 'BX', 'B', 'cm', 'cs', 'c', 'CS', 'd0', 'd1', 'd', 'Do', 'DP', 'EMC', 'EI', 'ET', 'EX', 'f*', 'f', 'F', 'gs', 'g', 'G', 'h', 'i', 'ID', 'I', 'j', 'J', 'k', 'K', 'l', 'm', 'MP', 'M', 'n', 'q', 'Q', 're', 'rg', 'ri', 'rI', 'RG', 'scn', 'sc', 'sh', 's', 'SCN', 'SC', 'S', 'T*', 'Tc', 'Td', 'TD', 'Tf', 'TJ', 'Tj', 'TL', 'Tm', 'Tr', 'Ts', 'Tw', 'Tz', 'v', 'w', 'W*', 'W', 'y', '\'', '"', ]; foreach ($operators as $operator) { $content = preg_replace( '/(?<!\w|\/)'.preg_quote($operator, '/').'(?![\w10\*])/', $operator."\r\n", $content ); }
// Restore the original content of the dictionary << >> commands $dictstore = array_reverse($dictstore, true); foreach ($dictstore as $id => $dict) { $content = str_replace('###'.$id.'###', $dict, $content); }
// Restore the original string content $pdfstrings = array_reverse($pdfstrings, true); foreach ($pdfstrings as $id => $text) { // Strings may contain escaped newlines, or literal newlines // and we should clean these up before replacing the string // back into the content stream; this ensures no strings are // split between two lines (every command must be on one line) $text = str_replace( ["\\\r\n", "\\\r", "\\\n", "\r", "\n"], ['', '', '', '\r', '\n'], $text );
$content = str_replace('@@@'.$id.'@@@', $text, $content); }
// Restore the original content of any inline images $pdfInlineImages = array_reverse($pdfInlineImages, true); foreach ($pdfInlineImages as $id => $image) { $content = str_replace( '^^^'.$id.'^^^', "\r\nBI\r\n".$image[0]." ID\r\n".$image[1]." EI\r\n", $content ); }
$content = trim(preg_replace(['/(\r\n){2,}/', '/\r\n +/'], "\r\n", $content));
return $content; }
/** * getSectionsText() now takes an entire, unformatted * document stream as a string, cleans it, then filters out * commands that aren't needed for text positioning/extraction. It * returns an array of unprocessed PDF commands, one command per * element. * * @internal */ public function getSectionsText(?string $content): array { $sections = [];
// A cleaned stream has one command on every line, so split the // cleaned stream content on \r\n into an array $textCleaned = preg_split( '/(\r\n|\n|\r)/', $this->formatContent($content), -1, \PREG_SPLIT_NO_EMPTY );
$inTextBlock = false; foreach ($textCleaned as $line) { $line = trim($line);
// Skip empty lines if ('' === $line) { continue; }
// If a 'BT' is encountered, set the $inTextBlock flag if (preg_match('/BT$/', $line)) { $inTextBlock = true; $sections[] = $line;
// If an 'ET' is encountered, unset the $inTextBlock flag } elseif ('ET' == $line) { $inTextBlock = false; $sections[] = $line; } elseif ($inTextBlock) { // If we are inside a BT ... ET text block, save all lines $sections[] = trim($line); } else { // Otherwise, if we are outside of a text block, only // save specific, necessary lines. Care should be taken // to ensure a command being checked for *only* matches // that command. For instance, a simple search for 'c' // may also match the 'sc' command. See the command // list in the formatContent() method above. // Add more commands to save here as you find them in // weird PDFs! if ('q' == $line[-1] || 'Q' == $line[-1]) { // Save and restore graphics state commands $sections[] = $line; } elseif (preg_match('/(?<!\w)B[DM]C$/', $line)) { // Begin marked content sequence $sections[] = $line; } elseif (preg_match('/(?<!\w)[DM]P$/', $line)) { // Marked content point $sections[] = $line; } elseif (preg_match('/(?<!\w)EMC$/', $line)) { // End marked content sequence $sections[] = $line; } elseif (preg_match('/(?<!\w)cm$/', $line)) { // Graphics position change commands $sections[] = $line; } elseif (preg_match('/(?<!\w)Tf$/', $line)) { // Font change commands $sections[] = $line; } elseif (preg_match('/(?<!\w)Do$/', $line)) { // Invoke named XObject command $sections[] = $line; } } }
return $sections; }
private function getDefaultFont(?Page $page = null): Font { $fonts = []; if (null !== $page) { $fonts = $page->getFonts(); }
$firstFont = $this->document->getFirstFont(); if (null !== $firstFont) { $fonts[] = $firstFont; }
if (\count($fonts) > 0) { return reset($fonts); }
return new Font($this->document, null, null, $this->config); }
/** * Decode a '[]TJ' command and attempt to use alternate * fonts if the current font results in output that contains * Unicode control characters. * * @internal * * @param array<int,array<string,string|bool>> $command */ private function getTJUsingFontFallback(Font $font, array $command, ?Page $page = null, float $fontFactor = 4): string { $orig_text = $font->decodeText($command, $fontFactor); $text = $orig_text;
// If we make this a Config option, we can add a check if it's // enabled here. if (null !== $page) { $font_ids = array_keys($page->getFonts());
// If the decoded text contains UTF-8 control characters // then the font page being used is probably the wrong one. // Loop through the rest of the fonts to see if we can get // a good decode. Allow x09 to x0d which are whitespace. while (preg_match('/[\x00-\x08\x0e-\x1f\x7f]/u', $text) || false !== strpos(bin2hex($text), '00')) { // If we're out of font IDs, then give up and use the // original string if (0 == \count($font_ids)) { return $orig_text; }
// Try the next font ID $font = $page->getFont(array_shift($font_ids)); $text = $font->decodeText($command, $fontFactor); } }
return $text; }
/** * Expects a string that is a full PDF dictionary object, * including the outer enclosing << >> angle brackets * * @internal * * @throws InvalidDictionaryObjectException */ public function parseDictionary(string $dictionary): array { // Normalize whitespace $dictionary = preg_replace(['/\r/', '/\n/', '/\s{2,}/'], ' ', trim($dictionary));
if ('<<' != substr($dictionary, 0, 2)) { throw new InvalidDictionaryObjectException('Not a valid dictionary object.'); }
$parsed = []; $stack = []; $currentName = ''; $arrayTypeNumeric = false;
// Remove outer layer of dictionary, and split on tokens $split = preg_split( '/(<<|>>|\[|\]|\/[^\s\/\[\]\(\)<>]*)/', trim(preg_replace('/^<<|>>$/', '', $dictionary)), -1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE );
foreach ($split as $token) { $token = trim($token); switch ($token) { case '': break;
// Open numeric array case '[': $parsed[$currentName] = []; $arrayTypeNumeric = true;
// Move up one level in the stack $stack[\count($stack)] = &$parsed; $parsed = &$parsed[$currentName]; $currentName = ''; break;
// Open hashed array case '<<': $parsed[$currentName] = []; $arrayTypeNumeric = false;
// Move up one level in the stack $stack[\count($stack)] = &$parsed; $parsed = &$parsed[$currentName]; $currentName = ''; break;
// Close numeric array case ']': // Revert string type arrays back to a single element if (\is_array($parsed) && 1 == \count($parsed) && isset($parsed[0]) && \is_string($parsed[0]) && '' !== $parsed[0] && '/' != $parsed[0][0]) { $parsed = '['.$parsed[0].']'; } // Close hashed array // no break case '>>': $arrayTypeNumeric = false;
// Move down one level in the stack $parsed = &$stack[\count($stack) - 1]; unset($stack[\count($stack) - 1]); break;
default: // If value begins with a slash, then this is a name // Add it to the appropriate array if ('/' == substr($token, 0, 1)) { $currentName = substr($token, 1); if (true == $arrayTypeNumeric) { $parsed[] = $currentName; $currentName = ''; } } elseif ('' != $currentName) { if (false == $arrayTypeNumeric) { $parsed[$currentName] = $token; } $currentName = ''; } elseif ('' == $currentName) { $parsed[] = $token; } } }
return $parsed; }
/** * Returns the text content of a PDF as a string. Attempts to add * whitespace for spacing and line-breaks where appropriate. * * getText() leverages getTextArray() to get the content * of the document, setting the addPositionWhitespace flag to true * so whitespace is inserted in a logical way for reading by * humans. */ public function getText(?Page $page = null): string { $this->addPositionWhitespace = true; $result = $this->getTextArray($page); $this->addPositionWhitespace = false;
return implode('', $result).' '; }
/** * Returns the text content of a PDF as an array of strings. No * extra whitespace is inserted besides what is actually encoded in * the PDF text. * * @throws \Exception */ public function getTextArray(?Page $page = null): array { $result = []; $text = [];
$marked_stack = []; $last_written_position = false;
$sections = $this->getSectionsText($this->content); $current_font = $this->getDefaultFont($page); $current_font_size = 1; $current_text_leading = 0;
$current_position = ['x' => false, 'y' => false]; $current_position_tm = [ 'a' => 1, 'b' => 0, 'c' => 0, 'i' => 0, 'j' => 1, 'k' => 0, 'x' => 0, 'y' => 0, 'z' => 1, ]; $current_position_td = ['x' => 0, 'y' => 0]; $current_position_cm = [ 'a' => 1, 'b' => 0, 'c' => 0, 'i' => 0, 'j' => 1, 'k' => 0, 'x' => 0, 'y' => 0, 'z' => 1, ];
$clipped_font = []; $clipped_position_cm = [];
self::$recursionStack[] = $this->getUniqueId();
foreach ($sections as $section) { $commands = $this->getCommandsText($section); foreach ($commands as $command) { switch ($command[self::OPERATOR]) { // Begin text object case 'BT': // Reset text positioning matrices $current_position_tm = [ 'a' => 1, 'b' => 0, 'c' => 0, 'i' => 0, 'j' => 1, 'k' => 0, 'x' => 0, 'y' => 0, 'z' => 1, ]; $current_position_td = ['x' => 0, 'y' => 0]; $current_text_leading = 0; break;
// Begin marked content sequence with property list case 'BDC': if (preg_match('/(<<.*>>)$/', $command[self::COMMAND], $match)) { $dict = $this->parseDictionary($match[1]);
// Check for ActualText block if (isset($dict['ActualText']) && \is_string($dict['ActualText']) && '' !== $dict['ActualText']) { if ('[' == $dict['ActualText'][0]) { // Simulate a 'TJ' command on the stack $marked_stack[] = [ 'ActualText' => $this->getCommandsText($dict['ActualText'].'TJ')[0], ]; } elseif ('<' == $dict['ActualText'][0] || '(' == $dict['ActualText'][0]) { // Simulate a 'Tj' command on the stack $marked_stack[] = [ 'ActualText' => $this->getCommandsText($dict['ActualText'].'Tj')[0], ]; } } } break;
// Begin marked content sequence case 'BMC': if ('ReversedChars' == $command[self::COMMAND]) { // Upon encountering a ReversedChars command, // add the characters we've built up so far to // the result array $result = array_merge($result, $text);
// Start a fresh $text array that will contain // reversed characters $text = [];
// Add the reversed text flag to the stack $marked_stack[] = ['ReversedChars' => true]; } break;
// set graphics position matrix case 'cm': $args = preg_split('/\s+/s', $command[self::COMMAND]); $current_position_cm = [ 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, ]; break;
case 'Do': if (is_null($page)) { break; }
$args = preg_split('/\s/s', $command[self::COMMAND]); $id = trim(array_pop($args), '/ '); $xobject = $page->getXObject($id);
// Check we got a PDFObject back. if (!$xobject instanceof self) { break; }
// If the PDFObject is an image, do nothing, as images aren't text. if ($xobject instanceof Image) { break; }
// Check this is not a circular reference. if (!\in_array($xobject->getUniqueId(), self::$recursionStack, true)) { $text[] = $xobject->getText($page); } break;
// Marked content point with (DP) & without (MP) property list case 'DP': case 'MP': break;
// End text object case 'ET': break;
// Store current selected font and graphics matrix case 'q': $clipped_font[] = [$current_font, $current_font_size]; $clipped_position_cm[] = $current_position_cm; break;
// Restore previous selected font and graphics matrix case 'Q': list($current_font, $current_font_size) = array_pop($clipped_font); $current_position_cm = array_pop($clipped_position_cm); break;
// End marked content sequence case 'EMC': $data = false; if (\count($marked_stack)) { $marked = array_pop($marked_stack); $action = key($marked); $data = $marked[$action];
switch ($action) { // If we are in ReversedChars mode... case 'ReversedChars': // Reverse the characters we've built up so far foreach ($text as $key => $t) { $text[$key] = implode('', array_reverse( mb_str_split($t, 1, mb_internal_encoding()) )); }
// Add these characters to the result array $result = array_merge($result, $text);
// Start a fresh $text array that will contain // non-reversed characters $text = []; break;
case 'ActualText': // Use the content of the ActualText as a command $command = $data; break; } }
// If this EMC command has been transformed into a 'Tj' // or 'TJ' command because of being ActualText, then bypass // the break to proceed to the writing section below. if ('Tj' != $command[self::OPERATOR] && 'TJ' != $command[self::OPERATOR]) { break; }
// no break case "'": case '"': if ("'" == $command[self::OPERATOR] || '"' == $command[self::OPERATOR]) { // Move to next line and write text $current_position['x'] = 0; $current_position_td['x'] = 0; $current_position_td['y'] += $current_text_leading; } // no break case 'Tj': $command[self::COMMAND] = [$command]; // no break case 'TJ': // Check the marked content stack for flags $actual_text = false; $reverse_text = false; foreach ($marked_stack as $marked) { if (isset($marked['ActualText'])) { $actual_text = true; } if (isset($marked['ReversedChars'])) { $reverse_text = true; } }
// Account for text position ONLY just before we write text if (false === $actual_text && \is_array($last_written_position)) { // If $last_written_position is an array, that // means we have stored text position coordinates // for placing an ActualText $currentX = $last_written_position[0]; $currentY = $last_written_position[1]; $last_written_position = false; } else { $currentX = $current_position_cm['x'] + $current_position_tm['x'] + $current_position_td['x']; $currentY = $current_position_cm['y'] + $current_position_tm['y'] + $current_position_td['y']; } $whiteSpace = '';
$factorX = -$current_font_size * $current_position_tm['a'] - $current_font_size * $current_position_tm['i']; $factorY = $current_font_size * $current_position_tm['b'] + $current_font_size * $current_position_tm['j'];
if (true === $this->addPositionWhitespace && false !== $current_position['x']) { $curY = $currentY - $current_position['y']; if (abs($curY) >= abs($factorY) / 4) { $whiteSpace = "\n"; } else { if (true === $reverse_text) { $curX = $current_position['x'] - $currentX; } else { $curX = $currentX - $current_position['x']; }
// In abs($factorX * 7) below, the 7 is chosen arbitrarily // as the number of apparent "spaces" in a document we // would need before considering them a "tab". In the // future, we might offer this value to users as a config // option. if ($curX >= abs($factorX * 7)) { $whiteSpace = "\t"; } elseif ($curX >= abs($factorX * 2)) { $whiteSpace = ' '; } } }
$newtext = $this->getTJUsingFontFallback( $current_font, $command[self::COMMAND], $page, $factorX );
// If there is no ActualText pending then write if (false === $actual_text) { $newtext = str_replace(["\r", "\n"], '', $newtext); if (false !== $reverse_text) { // If we are in ReversedChars mode, add the whitespace last $text[] = preg_replace('/ $/', ' ', $newtext.$whiteSpace); } else { // Otherwise add the whitespace first if (' ' === $whiteSpace && isset($text[\count($text) - 1])) { $text[\count($text) - 1] = preg_replace('/ $/', '', $text[\count($text) - 1]); } $text[] = preg_replace('/^[ \t]{2}/', ' ', $whiteSpace.$newtext); }
// Record the position of this inserted text for comparison // with the next text block. // Provide a 'fudge' factor guess on how wide this text block // is based on the number of characters. This helps limit the // number of tabs inserted, but isn't perfect. $factor = $factorX / 2; $current_position = [ 'x' => $currentX - mb_strlen($newtext) * $factor, 'y' => $currentY, ]; } elseif (false === $last_written_position) { // If there is an ActualText in the pipeline // store the position this undisplayed text // *would* have been written to, so the // ActualText is displayed in the right spot $last_written_position = [$currentX, $currentY]; $current_position['x'] = $currentX; } break;
// move to start of next line case 'T*': $current_position['x'] = 0; $current_position_td['x'] = 0; $current_position_td['y'] += $current_text_leading; break;
// set character spacing case 'Tc': break;
// move text current point and set leading case 'Td': case 'TD': // move text current point $args = preg_split('/\s+/s', $command[self::COMMAND]); $y = (float) array_pop($args); $x = (float) array_pop($args);
if ('TD' == $command[self::OPERATOR]) { $current_text_leading = -$y * $current_position_tm['b'] - $y * $current_position_tm['j']; }
$current_position_td = [ 'x' => $current_position_td['x'] + $x * $current_position_tm['a'] + $x * $current_position_tm['i'], 'y' => $current_position_td['y'] + $y * $current_position_tm['b'] + $y * $current_position_tm['j'], ]; break;
case 'Tf': $args = preg_split('/\s/s', $command[self::COMMAND]); $size = (float) array_pop($args); $id = trim(array_pop($args), '/'); if (null !== $page) { $new_font = $page->getFont($id); // If an invalid font ID is given, do not update the font. // This should theoretically never happen, as the PDF spec states for the Tf operator: // "The specified font value shall match a resource name in the Font entry of the default resource dictionary" // (https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf, page 435) // But we want to make sure that malformed PDFs do not simply crash. if (null !== $new_font) { $current_font = $new_font; $current_font_size = $size; } } break;
// set leading case 'TL': $y = (float) $command[self::COMMAND]; $current_text_leading = -$y * $current_position_tm['b'] + -$y * $current_position_tm['j']; break;
// set text position matrix case 'Tm': $args = preg_split('/\s+/s', $command[self::COMMAND]); $current_position_tm = [ 'a' => (float) $args[0], 'b' => (float) $args[1], 'c' => 0, 'i' => (float) $args[2], 'j' => (float) $args[3], 'k' => 0, 'x' => (float) $args[4], 'y' => (float) $args[5], 'z' => 1, ]; break;
// set text rendering mode case 'Ts': break;
// set super/subscripting text rise case 'Ts': break;
// set word spacing case 'Tw': break;
// set horizontal scaling case 'Tz': break;
default: } } }
$result = array_merge($result, $text);
return $result; }
/** * getCommandsText() expects the content of $text_part to be an * already formatted, single-line command from a document stream. * The companion function getSectionsText() returns a document * stream as an array of single commands for just this purpose. * Because of this, the argument $offset is no longer used, and * may be removed in a future PdfParser release. * * A better name for this function would be getCommandText() * since it now always works on just one command. */ public function getCommandsText(string $text_part, int &$offset = 0): array { $commands = $matches = [];
preg_match('/^(([\/\[\(<])?.*)(?<!\w)([a-z01\'\"*]+)$/i', $text_part, $matches);
// If no valid command is detected, return an empty array if (!isset($matches[1]) || !isset($matches[2]) || !isset($matches[3])) { return []; }
$type = $matches[2]; $operator = $matches[3]; $command = trim($matches[1]);
if ('TJ' == $operator) { $subcommand = []; $command = trim($command, '[]'); do { $oldCommand = $command;
// Search for parentheses string () format if (preg_match('/^ *\((.*?)(?<![^\\\\]\\\\)\) *(-?[\d.]+)?/', $command, $tjmatch)) { $subcommand[] = [ self::TYPE => '(', self::OPERATOR => 'TJ', self::COMMAND => $tjmatch[1], ]; if (isset($tjmatch[2]) && trim($tjmatch[2])) { $subcommand[] = [ self::TYPE => 'n', self::OPERATOR => '', self::COMMAND => $tjmatch[2], ]; } $command = substr($command, \strlen($tjmatch[0])); }
// Search for hexadecimal <> format if (preg_match('/^ *<([0-9a-f\s]*)> *(-?[\d.]+)?/i', $command, $tjmatch)) { $tjmatch[1] = preg_replace('/\s/', '', $tjmatch[1]); $subcommand[] = [ self::TYPE => '<', self::OPERATOR => 'TJ', self::COMMAND => $tjmatch[1], ]; if (isset($tjmatch[2]) && trim($tjmatch[2])) { $subcommand[] = [ self::TYPE => 'n', self::OPERATOR => '', self::COMMAND => $tjmatch[2], ]; } $command = substr($command, \strlen($tjmatch[0])); } } while ($command != $oldCommand);
$command = $subcommand; } elseif ('Tj' == $operator || "'" == $operator || '"' == $operator) { // Depending on the string type, trim the data of the // appropriate delimiters if ('(' == $type) { // Don't use trim() here since a () string may end with // a balanced or escaped right parentheses, and trim() // will delete both. Both strings below are valid: // eg. (String()) // eg. (String\)) $command = preg_replace('/^\(|\)$/', '', $command); } elseif ('<' == $type) { $command = trim($command, '<>'); } } elseif ('/' == $type) { $command = substr($command, 1); }
$commands[] = [ self::TYPE => $type, self::OPERATOR => $operator, self::COMMAND => $command, ];
return $commands; }
public static function factory( Document $document, Header $header, ?string $content, ?Config $config = null ): self { switch ($header->get('Type')->getContent()) { case 'XObject': switch ($header->get('Subtype')->getContent()) { case 'Image': return new Image($document, $header, $config->getRetainImageContent() ? $content : null, $config);
case 'Form': return new Form($document, $header, $content, $config); }
return new self($document, $header, $content, $config);
case 'Pages': return new Pages($document, $header, $content, $config);
case 'Page': return new Page($document, $header, $content, $config);
case 'Encoding': return new Encoding($document, $header, $content, $config);
case 'Font': $subtype = $header->get('Subtype')->getContent(); $classname = '\Smalot\PdfParser\Font\Font'.$subtype;
if (class_exists($classname)) { return new $classname($document, $header, $content, $config); }
return new Font($document, $header, $content, $config);
default: return new self($document, $header, $content, $config); } }
/** * Returns unique id identifying the object. */ protected function getUniqueId(): string { return spl_object_hash($this); } }
|