#!/usr/bin/env perl # # Copyright (c) 2007, Gregory Fleischer (gfleischer@gmail.com) # # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # 3. The names of the authors may not be used to endorse or promote # products derived from this software without specific prior # written permission. # # THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. # # # # docinfo.pl - retrieve document information from Word document # # uses Harlan Carvey's MSWord package # see http://windowsir.blogspot.com/2006/09/metadata-and-ediscovery.html # use File::MSWord; use strict; sub usage; sub error; sub trace; sub info; sub addbuilddate; ### main eval { my $file = shift() or usage(1); if (! -f $file) { error("file [$file] doesn't exist"); } trace("processing [$file]"); my $doc = File::MSWord::new($file); my @guid = $doc->getGUID(); info("document guid: %x %x", @guid); info("---=== Summary Information ===---"); #~ subject #~ lastauth #~ lastprinted #~ appname #~ created #~ lastsaved #~ revnum #~ title my %summary = $doc->getSummaryInfo(); info("Title: %s", $summary{title}); info("Subject: %s", $summary{subject}); info("Application: %s", $summary{appname}); info("Last author: %s", $summary{lastauth}); info("Created: %s", $summary{created}); info("Last saved: %s", $summary{lastsaved}); info("Last printed: %s", $summary{lastprinted}); info("Revision Number: %s", $summary{revnum}); my %docSummary = $doc->getSummaryInfo(); #~ org info("Organization: %s", $docSummary{org}); my %binaryData = $doc->getDocBinaryData(); trace("binary data:"); my $k; foreach $k (keys %binaryData) { trace("%s: [%s]", $k, $binaryData{$k}); } # language my %langXlat = ( 1025 => "Arabic (Saudi Arabia)", 1069 => "Basque", 1026 => "Bulgarian", 1027 => "Catalan", 2052 => "Chinese (Simplified)", 1028 => "Chinese (Traditional)", 1050 => "Croatian", 1029 => "Czech", 1030 => "Danish", 1043 => "Dutch (Standard)", 1033 => "English", 1035 => "Finnish", 3084 => "French (Canadian)", 1036 => "French (Standard)", 1031 => "German", 1032 => "Greek", 1037 => "Hebrew", 1038 => "Hungarian", 1057 => "Indonesian", 1040 => "Italian (Standard)", 1041 => "Japanese", 1042 => "Korean", 1044 => "Norwegian (Bokmal)", 1045 => "Polish", 1046 => "Portuguese (Brazilian)", 2070 => "Portuguese (Standard)", 1048 => "Romanian", 1049 => "Russian", 1051 => "Slovak", 1060 => "Slovene", 1034 => "Spanish (Traditional Sort)", 1053 => "Swedish", 1054 => "Thai", 1055 => "Turkish", ); my $langid = $binaryData{langid}; if (exists $langXlat{$langid}) { trace("Document: Language = [%s] (%d)", $langXlat{$langid}, $langid); } else { trace("Document: Language = [%s] (%d)", "???", $langid); } info("Language: %s", $doc->getLangID($langid)); # characteristics info("---=== Document Characteristics ===---"); my %fDotXlat = ( 0x0001 => "Is template", 0x0002 => "Is glossary", # 0x0004 => sub { if (1 == @_[0]) { "Complex, fast-saved format" }; }, 0x0008 => "Contains one or more pictures", # 0x00F0 => sub { if (@_[0]) { sprintf ("Quick saved [%d] times", (@_[0]>>4)); } }, 0x0100 => "Is encrypted", 0x0400 => "Recommend read-only", 0x0800 => "Write is reserved", 0x1000 => "Uses extended character set", ); foreach $k (keys %fDotXlat) { my $b = $binaryData{fDot} & $k; if ($b) { my $v = $fDotXlat{$k}; if (ref($v) eq "CODE") { info(" %s", &$v($b)); } else { info(" %s", $v); } } } # determine where file created and last saved my %magicXlat = ( 0x6a62 => "MS Word 97", 0x626a => "Word 98 Mac", 0xa5dc => "Word 6.0/7.0", 0xa5ec => "Word 8.0", ); # program / platform my @created = ("???","???", ""); my @revised = ("???","???", ""); # program my @magic = $doc->getMagicIDs(); trace("magic[0]=%x,magic[1]=%x", @magic); if (exists $magicXlat{$magic[0]}) { $created[0] = $magicXlat{$magic[0]}; } if (exists $magicXlat{$magic[1]}) { $revised[0] = $magicXlat{$magic[1]}; } # platform if ($binaryData{envr} & 0x01) { $created[1] = "a Mac"; } else { $created[1] = "Windows"; } if ($binaryData{fMac} & 0x01) { $revised[1] = "a Mac"; } elsif ($binaryData{fMac} & 0x10) { $revised[1] = "Windows"; } my @buildDates = $doc->getBuildDates(); trace("buildDates[0]=%d,buildDates[1]=%d", @buildDates); $created[2] = addbuilddate($buildDates[0]); $revised[2] = addbuilddate($buildDates[1]); info("Created by %s on %s%s", @created); info("Revised by %s on %s%s", @created); my @savedBy = $doc->getSavedBy(); my $buf = $doc->readStreamTable($savedBy[0],$savedBy[1]); my %revisions = $doc->parseSTTBF($buf, "author", "path"); info("---=== Author(s) Information ===---"); foreach $k (sort { $a <=> $b } keys(%revisions)) { my $author = $revisions{$k}{"author"}; if ($author=~/^\s*$/) { $author = "(unknown)"; } my $path = $revisions{$k}{"path"}; if ($path=~/^\s*$/) { $path = "(no path)"; } info("%d. %s : %s", $k, $author, $path); } }; if ($@) { error($@); } exit(0); sub error() { my $msg = @_[0]; chomp($msg); print $msg, "\n"; exit(2); } sub usage() { print "$0 \n"; exit(@_[0]); } sub trace() { my $fmt = shift(); # print sprintf($fmt, @_), "\n"; } sub info() { my $fmt = shift(); print sprintf($fmt, @_), "\n"; } sub addbuilddate { my $d = @_[0]; if ($d=~/(\d{1,2})(\d{2})(\d{2})/) { my $mon = $1; my $day = $2; my $yr = $3; if ($yr < 50) { $yr = 2000 + $yr; } else { $yr = 1900 + $yr; } return sprintf(" (build date %02d/%02d/%04d)", $mon, $day, $yr); } return ""; } # eof