$object_id, 'generation' => (integer) $matches[2], 'start' => (integer) $matches[1] );
					}
				}

				$object_id++;
				$offset += 20;
			} else {
				break;
			}
		}
	}

	/**
	 * Parse a cross-reference table section into the array of indirect object definitions
	 *
	 * Reads each "first-object-number count" subsection header found at the given
	 * offset and passes the subsection to _parse_pdf_xref_subsection(). Every entry
	 * is taken to be 20 bytes long, so a subsection occupies 20 * count bytes.
	 *
	 * @since 2.10
	 *
	 * @param string full path and file name
	 * @param integer offset within the file of the xref id and count entry
	 *
	 * @return integer length of the section
	 */
	private static function _parse_pdf_xref_section( $file_name, $file_offset ) {
		$xref_max = $chunksize = 16384;
		$xref_section = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );
		if ( false === $xref_section ) {
			return 0;
		}

		$xref_length = 0;
		while ( preg_match( '/^[\x00-\x20]*(\d+) (\d+)[\x00-\x20]*/', (string) substr( $xref_section, $xref_length ), $matches ) ) {
			$object_id = (integer) $matches[1];
			$count = (integer) $matches[2];
			$offset = $xref_length + strlen( $matches[0] );
			$xref_length = $offset + ( 20 * $count );

			/*
			 * Grow the buffer far enough to hold the whole subsection; a single
			 * extra chunk is not enough when the subsection has many entries.
			 */
			if ( $xref_max < $xref_length ) {
				$xref_max = (integer) ( ceil( $xref_length / $chunksize ) * $chunksize );
				$xref_section = file_get_contents( $file_name, true, NULL, $file_offset, $xref_max );
				if ( false === $xref_section ) {
					$xref_section = '';
				}
			}

			self::_parse_pdf_xref_subsection( $xref_section, $offset, $object_id, $count );
		} // while preg_match subsection header

		return $xref_length;
	}

	/**
	 * Measure a cross-reference stream
	 *
	 * Locates the data between the "stream" and "endstream" keywords and returns its
	 * length. The stream entries themselves are not decoded, so no indirect object
	 * definitions are added by this function.
	 *
	 * @since 2.10
	 *
	 * @param string full path and file name
	 * @param integer offset within the file of the "stream" keyword
	 * @param string "/W" entry, representing the size of the fields in a single entry; currently unused
	 *
	 * @return integer length of the stream, or zero (0) if the stream cannot be delimited
	 */
	private static function _parse_pdf_xref_stream( $file_name, $file_offset, $entry_parms_string ) {
		$chunksize = 16384;
		$xref_section = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );

		if ( ! is_string( $xref_section ) || 'stream' != substr( $xref_section, 0, 6 ) ) {
			return 0;
		}

		// The keyword is followed by a one-byte (LF) or two-byte (CR LF) end-of-line marker
		$tag_length = 7;
		if ( isset( $xref_section[6] ) && chr(0x0D) == $xref_section[6] ) {
			$tag_length++;
		}

		// Nothing follows the keyword; searching past the end of the buffer is an error in PHP 8
		if ( strlen( $xref_section ) < $tag_length ) {
			return 0;
		}

		/*
		 * If necessary and possible, expand the $xref_section until it contains the end tag.
		 * A short read means the end of the file has been reached.
		 */
		$read_size = $chunksize;
		$end_tag = strpos( $xref_section, 'endstream', $tag_length );
		while ( ( false === $end_tag ) && ( $read_size == strlen( $xref_section ) ) ) {
			$read_size += $chunksize;
			$xref_section = file_get_contents( $file_name, true, NULL, $file_offset, $read_size );
			if ( false === $xref_section ) {
				return 0;
			}

			$end_tag = strpos( $xref_section, 'endstream', $tag_length );
		} // while not found

		if ( false === $end_tag ) {
			return 0;
		}

		return $end_tag - $tag_length;
	}

	/**
	 * Build an array of indirect object definitions
	 *
	 * Creates the array of indirect object offsets and lengths by scanning the
	 * document for "obj", "endobj", "stream" and "endstream" keywords. Keywords
	 * found between "stream" and "endstream" are ignored. Does nothing if the
	 * array has already been built.
	 *
	 * @since 2.10
	 *
	 * @param string The entire PDF document, passed by reference
	 *
	 * @return void
	 */
	private static function _build_pdf_indirect_objects( &$string ) {
		if ( ! is_null( self::$pdf_indirect_objects ) ) {
			return;
		}

		$match_count = (integer) preg_match_all( '!(\d+)\\h+(\d+)\\h+obj|endobj|stream(\x0D\x0A|\x0A)|endstream!', $string, $matches, PREG_OFFSET_CAPTURE );
		self::$pdf_indirect_objects = array();

		// The object whose header has been seen but whose "endobj" has not
		$object_entry = NULL;
		$is_stream = false;
		for ( $index = 0; $index < $match_count; $index++ ) {
			$token = $matches[0][ $index ][0];
			$position = $matches[0][ $index ][1];

			if ( $is_stream ) {
				if ( 'endstream' == substr( $token, 0, 9 ) ) {
					$is_stream = false;
				}
			} elseif ( 'endobj' == substr( $token, 0, 6 ) ) {
				// Ignore an "endobj" that has no matching object header
				if ( is_array( $object_entry ) ) {
					$object_entry['/length'] = $position - $object_entry['start'];
					self::$pdf_indirect_objects[ ( $object_entry['number'] * 1000 ) + $object_entry['generation'] ] = $object_entry;
					$object_entry = NULL;
				}
			} elseif ( 'obj' == substr( $token, -3 ) ) {
				// 'start' is the offset of the first byte after the "n g obj" header
				$object_entry = array(
					'number' => $matches[1][ $index ][0],
					'generation' => $matches[2][ $index ][0],
					'start' => $position + strlen( $token )
				);
			} elseif ( 'stream' == substr( $token, 0, 6 ) ) {
				$is_stream = true;
			} else {
				/* translators: 1: ERROR tag 2: index */
				MLACore::mla_debug_add( sprintf( _x( '%1$s: _build_pdf_indirect_objects bad value at $index = "%2$d".', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $index ), MLACore::MLA_DEBUG_CATEGORY_ANY );
			}
		} // for each match
	}

	/**
	 * Find the offset, length and contents of an indirect object containing a dictionary
	 *
	 * The function searches the entire file, if necessary, to find the last/most recent copy of the object.
	 * This is required because Adobe Acrobat does NOT increment the generation number when it reuses an object.
*
	 * @since 2.10
	 *
	 * @param string full path and file name
	 * @param integer The object number
	 * @param integer The object generation number; default zero (0)
	 * @param integer The desired object instance (when multiple instances are present); default "highest/latest"
	 *
	 * @return mixed NULL on failure else array( 'count' => number of instances found, 'start' => offset in the file, 'length' => object length, 'content' => dictionary contents )
	 */
	private static function _find_pdf_indirect_dictionary( $file_name, $object, $generation = 0, $instance = NULL ) {
		$chunksize = 16384;
		$overlap = 16;

		// Start at the recorded object location when there is one, else at the top of the file
		$key = ( $object * 1000 ) + $generation;
		if ( isset( self::$pdf_indirect_objects ) && isset( self::$pdf_indirect_objects[ $key ] ) ) {
			$file_offset = self::$pdf_indirect_objects[ $key ]['start'];
		} else {
			$file_offset = 0;
		}

		/*
		 * Match the object header. The lookbehind stops object 5 from matching
		 * inside the header of object 15.
		 */
		$pattern = sprintf( '!(?<![0-9])%1$d\\h+%2$d\\h+obj[\\x00-\\x20]*(<<)!', $object, $generation );

		/*
		 * Walk the file in overlapping chunks and record every instance of the header.
		 * Instances are keyed by absolute position so a header lying in the overlap
		 * between two chunks is recorded only once.
		 */
		$object_starts = array();
		$preceding_byte = '';
		$object_content = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );
		while ( is_string( $object_content ) ) {
			if ( preg_match_all( $pattern, $object_content, $matches, PREG_OFFSET_CAPTURE ) ) {
				foreach ( $matches[1] as $match_index => $match ) {
					// The lookbehind cannot see the previous chunk; check its last byte by hand
					if ( ( 0 == $matches[0][ $match_index ][1] ) && ( '' !== $preceding_byte ) && ctype_digit( $preceding_byte ) ) {
						continue;
					}

					$object_starts[ $file_offset + $match[1] ] = array( 'offset' => $file_offset, 'start' => $match[1] );
				}
			}

			// A short read means the end of the file has been reached
			if ( $chunksize != strlen( $object_content ) ) {
				break;
			}

			$preceding_byte = $object_content[ $chunksize - $overlap - 1 ];
			$file_offset += ( $chunksize - $overlap );
			$object_content = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );
		} // while more chunks

		$object_starts = array_values( $object_starts );

		/*
		 * Return the highest/latest instance unless a specific instance is requested
		 */
		$object_count = count( $object_starts );
		if ( is_null( $instance ) ) {
			$object_start = array_pop( $object_starts );
		} else {
			$instance = absint( $instance );
			$object_start = isset( $object_starts[ $instance ] ) ? $object_starts[ $instance ] : NULL;
		}

		if ( is_null( $object_start ) ) {
			return NULL;
		}

		$file_offset = $object_start['offset'];
		$start = $object_start['start'];
		$object_content = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );

		/*
		 * The dictionary ends at the '>>' followed by the "endobj" or "stream" keyword.
		 * If necessary and possible, expand the $object_content until it contains the end tag
		 */
		$pattern = '!>>[\\x00-\\x20]*(?:endobj|stream)!';
		$match_count = preg_match( $pattern, $object_content, $matches, PREG_OFFSET_CAPTURE, $start );
		if ( 0 == $match_count && ( $chunksize == strlen( $object_content ) ) ) {
			// Re-base the buffer on the dictionary start so it can grow from there
			$file_offset = $file_offset + $start;
			$start = 0;
			$new_chunksize = $chunksize;
			do {
				$new_chunksize = $new_chunksize + $chunksize;
				$object_content = file_get_contents( $file_name, true, NULL, $file_offset, $new_chunksize );
				$match_count = preg_match( $pattern, $object_content, $matches, PREG_OFFSET_CAPTURE, $start );
			} while ( 0 == $match_count && ( $new_chunksize == strlen( $object_content ) ) );
		} // if not found

		if ( 0 == $match_count ) {
			return NULL;
		}

		// The length runs from the opening '<<' through the closing '>>'
		$results = array(
			'count' => $object_count,
			'start' => $file_offset + $start,
			'length' => ( $matches[0][1] + 2 ) - $start
		);
		$results['content'] = substr( $object_content, $start, $results['length'] );
		return $results;
	}

	/**
	 * Parse a PDF Unicode (16-bit Big Endian) object
	 *
	 * The first two bytes (the byte order mark) are skipped. Values below 0x80 are
	 * copied as-is, values below 0x100 are translated through MLAData::$utf8_chars
	 * and every other value is replaced by a period. An unpaired trailing byte is ignored.
	 *
	 * @since 2.10
	 *
	 * @param string PDF string of 16-bit characters
	 *
	 * @return string UTF-8 encoded string
	 */
	private static function _parse_pdf_UTF16BE( &$source_string ) {
		$output = '';
		$source_length = strlen( $source_string );
		for ( $index = 2; ( $index + 1 ) < $source_length; $index += 2 ) {
			$value = ( ord( $source_string[ $index ] ) << 8 ) + ord( $source_string[ $index + 1 ] );
			if ( $value < 0x80 ) {
				$output .= chr( $value );
			} elseif ( $value < 0x100 ) {
				$output .= MLAData::$utf8_chars[ $value - 0x80 ];
			} else {
				$output .= '.'; // TODO encode the rest
			}
		}

		return $output;
	}

	/**
	 * Parse a PDF string object
	 *
	 * Returns an array with one dictionary entry. The array also has a '/length' element containing
	 * the number of bytes occupied by the string in the source string, including the enclosing parentheses.
	 * A string that is not terminated before the end of the source is returned as far as it goes.
	 *
	 * @since 2.10
	 *
	 * @param string data within which the string occurs
	 * @param integer offset within the source string of the opening '(' character.
	 *
	 * @return array ( 'type' => type, 'value' => value, '/length' => length ) for the string
	 */
	private static function _parse_pdf_string( &$source_string, $offset ) {
		if ( ! isset( $source_string[ $offset ] ) || '(' != $source_string[ $offset ] ) {
			return array( 'type' => 'unknown', 'value' => '', '/length' => 0 );
		}

		/*
		 * Brute force, here we come...
		 */
		$source_length = strlen( $source_string );
		$output = '';
		$level = 0;
		$index = $offset + 1;
		while ( $index < $source_length ) {
			$byte = $source_string[ $index++ ];

			if ( '\\' == $byte ) { // REVERSE SOLIDUS
				// A backslash at the very end of the source has nothing to escape
				if ( $index >= $source_length ) {
					break;
				}

				switch ( $source_string[ $index ] ) {
					case chr( 0x0A ):
						// Line continuation; a two-byte end-of-line is swallowed whole
						if ( isset( $source_string[ $index + 1 ] ) && chr( 0x0D ) == $source_string[ $index + 1 ] ) {
							$index++;
						}
						break;
					case chr( 0x0D ):
						if ( isset( $source_string[ $index + 1 ] ) && chr( 0x0A ) == $source_string[ $index + 1 ] ) {
							$index++;
						}
						break;
					case 'n':
						$output .= chr( 0x0A );
						break;
					case 'r':
						$output .= chr( 0x0D );
						break;
					case 't':
						$output .= chr( 0x09 );
						break;
					case 'b':
						$output .= chr( 0x08 );
						break;
					case 'f':
						$output .= chr( 0x0C );
						break;
					default: // could be a 1- to 3-digit octal value
						$digit_count = 0;
						while ( ( $digit_count < 3 ) && isset( $source_string[ $index + $digit_count ] ) && ( false !== strpos( '01234567', $source_string[ $index + $digit_count ] ) ) ) {
							$digit_count++;
						}

						if ( $digit_count ) {
							// Only the low-order eight bits of the octal value are kept
							$output .= chr( octdec( substr( $source_string, $index, $digit_count ) ) & 0xFF );
							$index += $digit_count - 1;
						} else { // accept the character following the backslash
							$output .= $source_string[ $index ];
						}
				} // switch

				$index++;
			} elseif ( '(' == $byte ) {
				// Balanced parentheses are part of the string value
				$level++;
				$output .= $byte;
			} elseif ( ')' == $byte ) {
				if ( 0 == $level-- ) {
					break; // the closing parenthesis of the string itself
				}

				$output .= $byte;
			} else {
				$output .= $byte; // just another 8-bit value
			}
		} // while in string

		return array( 'type' => 'string', 'value' => $output, '/length' => $index - $offset );
	}

	/**
	 * Parse a PDF Linearization Parameter Dictionary object
	 *
	 * Returns an array of dictionary contents, classified by object type: boolean, numeric, string, hex (string),
	 * indirect (object), name, array, dictionary, stream, and null.
	 * The array also has a '/length' element containing the number of bytes occupied by the
	 * dictionary in the source string, excluding the enclosing delimiters, if passed in.
*
	 * @since 2.10
	 *
	 * @param string data within which the object occurs, typically the start of a PDF document
	 * @param integer filesize of the PDF document, for validation purposes, or zero (0) to ignore filesize
	 *
	 * @return mixed array of dictionary objects on success, false on failure
	 */
	private static function _parse_pdf_LPD_dictionary( &$source_string, $filesize ) {
		// Only the first 1024 bytes of the source are searched for the dictionary
		$header = substr( $source_string, 0, 1024 );
		$match_count = preg_match( '!obj[\x00-\x20]*<<(/Linearized).*?(>>)[\x00-\x20]*endobj!s', $header, $matches, PREG_OFFSET_CAPTURE );
		if ( ! $match_count ) {
			return false;
		}

		// Parsing starts at the first content character, just after the opening '<<'
		$LPD = self::_parse_pdf_dictionary( $header, $matches[1][1] );
		if ( empty( $LPD['/length'] ) ) {
			return false;
		}

		/*
		 * The /L entry holds the length of the entire file; a mismatch means
		 * the linearization information is no longer valid.
		 */
		if ( $filesize && isset( $LPD['L']['value'] ) && is_numeric( $LPD['L']['value'] ) ) {
			if ( (integer) $LPD['L']['value'] != (integer) $filesize ) {
				return false;
			}
		}

		return $LPD;
	}

	/**
	 * Parse a PDF dictionary object
	 *
	 * Returns an array of dictionary contents, classified by object type: boolean, numeric, string, hex (string),
	 * indirect (object), name, array, dictionary, stream, and null.
	 * The array also has a '/length' element containing the number of bytes from the
	 * given offset through the closing '>>' delimiter.
	 *
	 * @since 2.10
	 *
	 * @param string data within which the string occurs
	 * @param integer offset within the source string of the opening '<<' characters or the first content character.
*
	 * @return array ( '/length' => length, key => array( 'type' => type, 'value' => value ) ) for each dictionary field
	 */
	private static function _parse_pdf_dictionary( &$source_string, $offset ) {
		/*
		 * Find the end of the dictionary
		 */
		if ( '<<' == substr( $source_string, $offset, 2 ) ) {
			$nest = $offset + 2;
		} else {
			$nest = $offset;
		}

		// Count nested '<<' ... '>>' pairs until the delimiter closing this dictionary is found
		$level = 1;
		do {
			$dictionary_end = strpos( $source_string, '>>', $nest );
			if ( false === $dictionary_end ) {
				/* translators: 1: ERROR tag 2: source offset 3: nest level */
				MLACore::mla_debug_add( sprintf( _x( '%1$s: _parse_pdf_dictionary offset = %2$d, nest = %3$d.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $offset, $nest ), MLACore::MLA_DEBUG_CATEGORY_ANY );
				/* translators: 1: ERROR tag 2: dictionary excerpt */
				MLACore::mla_debug_add( sprintf( _x( '%1$s: _parse_pdf_dictionary no end delimiter dump = %2$s.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), MLAData::mla_hex_dump( substr( $source_string, $offset, 128 ), 128, 16 ) ), MLACore::MLA_DEBUG_CATEGORY_ANY );
				return array( '/length' => 0 );
			}

			$nest = strpos( $source_string, '<<', $nest );
			if ( ( false !== $nest ) && ( $nest < $dictionary_end ) ) {
				$nest += 2;
				$level++;
			} else {
				$nest = $dictionary_end + 2;
				$level--;
			}
		} while ( $level );

		$dictionary_length = $dictionary_end + 2 - $offset;
		$dictionary = array();

		// \x00-\x20 for whitespace
		// \(|\)|\<|\>|\[|\]|\{|\}|\/|\% for delimiters
		$match_count = preg_match_all( '!/([^\x00-\x20|\(|\)|\<|\>|\[|\]|\{|\}|\/|\%]*)([\x00-\x20]*)!', substr( $source_string, $offset, $dictionary_length ), $matches, PREG_OFFSET_CAPTURE );
		$end_data = -1;
		for ( $match_index = 0; $match_index < $match_count; $match_index++ ) {
			$name = $matches[1][ $match_index ][0];
			$value_start = $offset + $matches[2][ $match_index ][1] + strlen( $matches[2][ $match_index ][0] );

			/*
			 * Skip over false matches within a string or nested dictionary
			 */
			if ( $value_start < $end_data ) {
				continue;
			}

			$end_data = -1;
			$value_count = preg_match( '!(\/?[^\/\x0D\x0A]*)!', substr( $source_string, $value_start, ( $dictionary_end - $value_start ) ), $value_matches, PREG_OFFSET_CAPTURE );
			if ( 1 != $value_count ) {
				$dictionary[ $name ] = array( 'value' => '', 'type' => 'nomatch' );
				continue;
			}

			$value = trim( $value_matches[0][0] );
			$dictionary[ $name ]['value'] = $value;
			if ( ! isset( $value[0] ) ) {
				/* translators: 1: ERROR tag 2: entry name 3: value excerpt */
				MLACore::mla_debug_add( sprintf( _x( '%1$s: _parse_pdf_dictionary bad value [ %2$s ] dump = %3$s', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $name, MLAData::mla_hex_dump( $value, 32, 16 ) ), MLACore::MLA_DEBUG_CATEGORY_ANY );
				continue;
			}

			if ( in_array( $value, array( 'true', 'false' ), true ) ) {
				$dictionary[ $name ]['type'] = 'boolean';
			} elseif ( is_numeric( $value ) ) {
				$dictionary[ $name ]['type'] = 'numeric';
			} elseif ( '(' == $value[0] ) {
				$dictionary[ $name ] = self::_parse_pdf_string( $source_string, $value_start );
				$end_data = $value_start + $dictionary[ $name ]['/length'];
				unset( $dictionary[ $name ]['/length'] );
			} elseif ( '<' == $value[0] ) {
				if ( isset( $value[1] ) && '<' == $value[1] ) {
					$dictionary[ $name ]['value'] = self::_parse_pdf_dictionary( $source_string, $value_start );
					$dictionary[ $name ]['type'] = 'dictionary';
					// '/length' already spans both delimiters because $value_start is at the '<<'
					$end_data = $value_start + $dictionary[ $name ]['value']['/length'];
					unset( $dictionary[ $name ]['value']['/length'] );
				} else {
					$dictionary[ $name ]['type'] = 'hex';
				}
			} elseif ( '/' == $value[0] ) {
				$dictionary[ $name ]['value'] = substr( $value, 1 );
				$dictionary[ $name ]['type'] = 'name';
				$match_index++; // Skip to the next key
			} elseif ( '[' == $value[0] ) {
				$dictionary[ $name ]['type'] = 'array';
				// The array ends at the first ']'; nested arrays are not followed
				$array_end = strpos( $source_string, ']', $value_start );
				if ( false === $array_end ) {
					// No closing bracket; take everything up to the end of the dictionary
					$array_end = $dictionary_end;
				}

				$array_length = $array_end - ( $value_start + 1 );
				$dictionary[ $name ]['value'] = substr( $source_string, $value_start + 1, $array_length );
				$end_data = 2 + $value_start + $array_length;
			} elseif ( 'null' == $value ) {
				$dictionary[ $name ]['type'] = 'null';
			} elseif ( 'stream' == substr( $value, 0, 6 ) ) {
				$dictionary[ $name ]['type'] = 'stream';
			} else {
				$object_count = preg_match( '!(\d+)\h+(\d+)\h+R!', $value, $object_matches );
				if ( 1 == $object_count ) {
					$dictionary[ $name ]['type'] = 'indirect';
					$dictionary[ $name ]['object'] = $object_matches[1];
					$dictionary[ $name ]['generation'] = $object_matches[2];
				} else {
					$dictionary[ $name ]['type'] = 'unknown';
				}
			}
		} // for each match

		$dictionary['/length'] = $dictionary_length;
		return $dictionary;
	}

	/**
	 * Extract dictionary from traditional cross-reference + trailer documents
	 *
	 * Handles both a traditional "xref" table followed by a "trailer" dictionary and
	 * a cross-reference stream object. Earlier trailers are collected by following
	 * the /Prev entry; an offset that has already been visited is not followed again.
	 *
	 * @since 2.10
	 *
	 * @param string full path to the desired file
	 * @param integer offset within file of the cross-reference table
	 * @param array offsets already visited while following /Prev entries; for internal use
	 *
	 * @return mixed array of "PDF dictionary arrays", newest first, or NULL on failure
	 */
	private static function _extract_pdf_trailer( $file_name, $file_offset, $visited_offsets = array() ) {
		$chunksize = 16384;

		// A /Prev chain that loops back on itself would otherwise recurse without end
		$file_offset = (integer) $file_offset;
		if ( in_array( $file_offset, $visited_offsets, true ) ) {
			return NULL;
		}

		$visited_offsets[] = $file_offset;
		$tail = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );
		$chunk_offset = 0;

		/*
		 * look for traditional xref and trailer
		 */
		if ( 'xref' == substr( $tail, $chunk_offset, 4 ) ) {
			$xref_length = self::_parse_pdf_xref_section( $file_name, $file_offset + $chunk_offset + 4 );
			$chunk_offset += 4 + $xref_length;

			// Reload the buffer when the trailer may not fit in what is left of it
			if ( $chunk_offset > ( $chunksize - 1024 ) ) {
				$file_offset += $chunk_offset;
				$tail = file_get_contents( $file_name, true, NULL, $file_offset, $chunksize );
				$chunk_offset = 0;
			}

			$match_count = preg_match( '/[\x00-\x20]*trailer[\x00-\x20]+/', $tail, $matches, PREG_OFFSET_CAPTURE, $chunk_offset );
			if ( $match_count ) {
				$chunk_offset = $matches[0][1] + strlen( $matches[0][0] );
				$dictionary = self::_parse_pdf_dictionary( $tail, $chunk_offset );

				$other_trailers = NULL;
				if ( isset( $dictionary['Prev']['value'] ) && is_numeric( $dictionary['Prev']['value'] ) ) {
					$other_trailers = self::_extract_pdf_trailer( $file_name, (integer) $dictionary['Prev']['value'], $visited_offsets );
				}

				// NOTE(review): earlier trailers are placed first here but last in the
				// cross-reference stream branch below; confirm which order is intended.
				if ( is_array( $other_trailers ) ) {
					return array_merge( $other_trailers, array( $dictionary ) );
				}

				return array( $dictionary );
			} // found 'trailer'
		} else { // did not find 'xref'
			/*
			 * Look for a cross-reference stream
			 */
			$match_count = preg_match( '!(\d+)\\h+(\d+)\\h+obj[\x00-\x20]*!', $tail, $matches, PREG_OFFSET_CAPTURE );
			if ( $match_count ) {
				$chunk_offset = $matches[0][1] + strlen( $matches[0][0] );

				if ( '<<' == substr( $tail, $chunk_offset, 2 ) ) {
					$dictionary = self::_parse_pdf_dictionary( $tail, $chunk_offset );

					/*
					 * Parse the cross-reference stream following the dictionary, if present
					 */
					if ( isset( $dictionary['Type'] ) && 'XRef' == $dictionary['Type']['value'] ) {
						$entry_parms = isset( $dictionary['W']['value'] ) ? $dictionary['W']['value'] : '';
						self::_parse_pdf_xref_stream( $file_name, $file_offset + $chunk_offset + (integer) $dictionary['/length'], $entry_parms );
					}

					$other_trailers = NULL;
					if ( isset( $dictionary['Prev']['value'] ) && is_numeric( $dictionary['Prev']['value'] ) ) {
						$other_trailers = self::_extract_pdf_trailer( $file_name, (integer) $dictionary['Prev']['value'], $visited_offsets );
					}

					if ( is_array( $other_trailers ) ) {
						return array_merge( array( $dictionary ), $other_trailers );
					}

					return array( $dictionary );
				} // found cross-reference stream dictionary
			} // found cross-reference stream object
		}

		return NULL;
	}

	/**
	 * Extract Metadata from a PDF file
	 *
	 * @since 2.10
	 *
	 * @param string full path to the desired file
	 *
	 * @return array ( 'xmp' => array( key => value ), 'pdf' =>
array( key => value ) ) for each metadata field, in string format
	 */
	public static function mla_extract_pdf_metadata( $file_name ) {
		$xmp = array();
		$metadata = array();
		self::$pdf_indirect_objects = NULL;
		$chunksize = 16384;

		if ( ! file_exists( $file_name ) ) {
			return array( 'xmp' => $xmp, 'pdf' => $metadata );
		}

		// Read the last chunk of the file, or all of it when it is no larger than one chunk
		$filesize = filesize( $file_name );
		$file_offset = ( $chunksize < $filesize ) ? ( $filesize - $chunksize ) : 0;
		$tail = file_get_contents( $file_name, false, NULL, $file_offset );

		if ( 0 == $file_offset ) {
			$header = substr( $tail, 0, 128 );
		} else {
			$header = file_get_contents( $file_name, false, NULL, 0, 128 );
		}

		if ( '%PDF-' == substr( $header, 0, 5 ) ) {
			$metadata['PDF_Version'] = substr( $header, 1, 7 );
			$metadata['PDF_VersionNumber'] = substr( $header, 5, 3 );
		}

		/*
		 * Find the xref and (optional) trailer; the last startxref in the tail is used
		 */
		$match_count = preg_match_all( '/startxref[\x00-\x20]+(\d+)[\x00-\x20]+\%\%EOF/', $tail, $matches, PREG_OFFSET_CAPTURE );
		if ( 0 == $match_count ) {
			/* translators: 1: ERROR tag 2: path and file */
			MLACore::mla_debug_add( sprintf( _x( '%1$s: File "%2$s", startxref not found.', 'error_log', 'media-library-assistant' ), __( 'ERROR', 'media-library-assistant' ), $file_name ), MLACore::MLA_DEBUG_CATEGORY_ANY );
			return array( 'xmp' => $xmp, 'pdf' => $metadata );
		}

		$startxref = (integer) $matches[1][ $match_count - 1 ][0];
		$trailer_dictionaries = self::_extract_pdf_trailer( $file_name, $startxref );
		if ( ! is_array( $trailer_dictionaries ) ) {
			return array( 'xmp' => $xmp, 'pdf' => $metadata );
		}

		/*
		 * Look for the Info dictionary; the first trailer that names one is used
		 */
		$info_reference = NULL;
		foreach ( $trailer_dictionaries as $trailer_dictionary ) {
			if ( isset( $trailer_dictionary['Info'] ) ) {
				$info_reference = $trailer_dictionary['Info'];
				break;
			}
		}

		// Only an indirect reference carries the object and generation numbers
		if ( isset( $info_reference['object'], $info_reference['generation'] ) ) {
			$info_object = self::_find_pdf_indirect_dictionary( $file_name, $info_reference['object'], $info_reference['generation'] );

			/*
			 * Handle single or multiple Info instances
			 */
			$info_objects = array();
			if ( $info_object ) {
				if ( 1 == $info_object['count'] ) {
					$info_objects[] = $info_object;
				} else {
					for ( $index = 0; $index < $info_object['count']; $index++ ) {
						$info_objects[] = self::_find_pdf_indirect_dictionary( $file_name, $info_reference['object'], $info_reference['generation'], $index );
					}
				}
			}

			foreach ( $info_objects as $info_object ) {
				// An instance that could not be re-read has nothing to contribute
				if ( ! isset( $info_object['content'] ) ) {
					continue;
				}

				$info_dictionary = self::_parse_pdf_dictionary( $info_object['content'], 0 );
				unset( $info_dictionary['/length'] );

				// Later instances overwrite the entries of earlier ones
				foreach ( $info_dictionary as $name => $value ) {
					if ( isset( $value['type'] ) && 'string' == $value['type'] ) {
						$prefix = substr( $value['value'], 0, 2 );
						if ( 'D:' == $prefix ) {
							$metadata[ $name ] = MLAData::mla_parse_pdf_date( $value['value'] );
						} elseif ( ( chr(0xFE) . chr(0xFF) ) == $prefix ) {
							$metadata[ $name ] = self::_parse_pdf_UTF16BE( $value['value'] );
						} else {
							$metadata[ $name ] = $value['value'];
						}
					} else {
						$metadata[ $name ] = $value['value'];
					}
				} // each info entry
			} // foreach Info object

			/*
			 * Remove spurious "Filter" dictionaries
			 */
			unset( $metadata['Filter'] );
			unset( $metadata['Length'] );
			unset( $metadata['Length1'] );
		} // found Info reference

		/*
		 * Look for XMP Metadata, reached through the document catalog (Root)
		 */
		$root_reference = NULL;
		foreach ( $trailer_dictionaries as $trailer_dictionary ) {
			if ( isset( $trailer_dictionary['Root'] ) ) {
				$root_reference = $trailer_dictionary['Root'];
				break;
			}
		}

		$root_object = NULL;
		if ( isset( $root_reference['object'], $root_reference['generation'] ) ) {
			$root_object = self::_find_pdf_indirect_dictionary( $file_name, $root_reference['object'], $root_reference['generation'] );
		}

		if ( $root_object ) {
			$root_dictionary = self::_parse_pdf_dictionary( $root_object['content'], 0 );
			unset( $root_dictionary['/length'] );

			if ( isset( $root_dictionary['Metadata'] ) ) {
				$xmp_reference = $root_dictionary['Metadata'];
				$xmp_object = NULL;
				if ( isset( $xmp_reference['object'], $xmp_reference['generation'] ) ) {
					$xmp_object = self::_find_pdf_indirect_dictionary( $file_name, $xmp_reference['object'], $xmp_reference['generation'] );
				}

				// The XMP packet follows the Metadata dictionary; fall back to the top of the file
				$xmp_offset = is_array( $xmp_object ) ? ( $xmp_object['start'] + $xmp_object['length'] ) : 0;
				$xmp = MLAData::mla_parse_xmp_metadata( $file_name, $xmp_offset );

				if ( is_array( $xmp ) ) {
					$metadata = array_merge( $metadata, $xmp );
				} else {
					// Try to recover by searching from the start of the file
					$xmp = MLAData::mla_parse_xmp_metadata( $file_name, 0 );
				}
			} // found Metadata reference
		} // found Root object

		return array( 'xmp' => $xmp, 'pdf' => $metadata );
	}
} // class MLAPDF
?>