Ticket #145: chm.diff

File chm.diff, 9.8 kB (added by nneonneo <nneonneo@gmail.com>, 1 year ago)

Patch to hachoir-parser/hachoir_parser/misc/chm.py

  • chm.py

    old new  
    66  http://www.wotsit.org (search "chm") 
    77- chmlib library 
    88  http://www.jedrea.com/chmlib/ 
     9- Unofficial CHM Spec 
     10  http://savannah.nongnu.org/projects/chmspec 
     11- Microsoft's HTML Help (.chm) format 
     12  http://www.speakeasy.org/~russotto/chm/chmformat.html 
    913 
    1014Author: Victor Stinner 
    1115Creation date: 2007-03-04 
     
    1317 
    1418from hachoir_parser import Parser 
    1519from hachoir_core.field import (Field, FieldSet, ParserError, 
    16     Int32, UInt32, UInt64, 
     20    Int32, UInt16, UInt32, UInt64, 
    1721    RawBytes, PaddingBytes, 
    1822    Enum, String) 
    1923from hachoir_core.endian import LITTLE_ENDIAN 
     
    4246                raise ParserError("CHM: CWord is limited to 64 bits") 
    4347            addr += 8 
    4448            byte = stream.readBits(addr, 8, endian) 
     49        value <<= 7 
    4550        value += byte 
    4651        self.createValue = lambda: value 
    4752 
     
    99104    def createFields(self): 
    100105        yield CWord(self, "name_len") 
    101106        yield String(self, "name", self["name_len"].value, charset="UTF-8") 
    102         yield CWord(self, "space") 
    103         yield CWord(self, "start") 
    104         yield filesizeHandler(CWord(self, "length")) 
     107        yield CWord(self, "section", "Section number that the entry data is in.") 
     108        yield CWord(self, "start", "Start offset of the data") 
     109        yield filesizeHandler(CWord(self, "length", "Length of the data")) 
    105110 
    106111    def createDescription(self): 
    107112        return "%s (%s)" % (self["name"].value, self["length"].display) 
     
    118123 
    119124        # Entries 
    120125        stop = self.size - self["free_space"].value * 8 
     126        entry_count = 0 
    121127        while self.current_size < stop: 
    122128            yield PMGL_Entry(self, "entry[]") 
     129            entry_count+=1 
    123130 
    124131        # Padding 
    125         padding = (self.size - self.current_size) // 8 
     132        quickref_frequency = 1 + (1 << self["/dir/itsp/density"].value) 
     133        num_quickref = (entry_count // quickref_frequency) 
     134        print self.current_size//8, quickref_frequency, num_quickref 
     135        padding = (self["free_space"].value - (num_quickref*2+2)) 
    126136        if padding: 
    127137            yield PaddingBytes(self, "padding", padding) 
     138        for i in range(num_quickref*quickref_frequency, 0, -quickref_frequency): 
     139            yield UInt16(self, "quickref[%i]"%i) 
     140        yield UInt16(self, "entry_count") 
    128141 
    129142class PMGI_Entry(FieldSet): 
    130143    def createFields(self): 
     
    158171 
    159172        if nb_dir < 0: 
    160173            nb_dir = 1 
     174        entries=[] 
    161175        for index in xrange(nb_dir): 
    162             yield PMGL(self, "pmgl[]", size=block_size) 
     176            directory=PMGL(self, "pmgl[]", size=block_size) 
     177            yield directory 
     178            entries+=directory.array("entry") 
    163179 
     180        self.entries=entries 
    164181        if self.current_size < self.size: 
    165182            yield PMGI(self, "pmgi", size=block_size) 
    166183 
     184class NameList(FieldSet): 
     185    def createFields(self): 
     186        yield UInt16(self, "length", "Length of name list in 2-byte blocks") 
     187        yield UInt16(self, "count", "Number of entries in name list") 
     188        for index in range(self["count"].value): 
     189            length=UInt16(self, "name_len[]", "Length of name in 2-byte blocks, excluding terminating null") 
     190            yield length 
     191            yield String(self, "name[]", length.value*2+2, charset="UTF-16-LE") 
     192 
     193class ControlData(FieldSet): 
     194    def createFields(self): 
     195        yield UInt32(self, "count", "Number of DWORDS in this struct") 
     196        yield String(self, "type", 4, "Type of compression") 
     197        if self["type"].value!='LZXC': return 
     198        yield UInt32(self, "version", "Compression version") 
     199        version=self["version"].value 
     200        if version==1: block='bytes' 
     201        else: block='32KB blocks' 
     202        yield UInt32(self, "reset_interval", "LZX: Reset interval in %s"%block) 
     203        yield UInt32(self, "window_size", "LZX: Window size in %s"%block) 
     204        yield UInt32(self, "cache_size", "LZX: Cache size in %s"%block) 
     205        yield UInt32(self, "unknown[]") 
     206 
     207class ResetTable(FieldSet): 
     208    def createFields(self): 
     209        yield UInt32(self, "unknown[]", "Version number?") 
     210        yield UInt32(self, "count", "Number of entries") 
     211        yield UInt32(self, "entry_size", "Size of each entry") 
     212        yield UInt32(self, "header_size", "Size of this header") 
     213        yield UInt64(self, "uncompressed_size") 
     214        yield UInt64(self, "compressed_size") 
     215        yield UInt64(self, "block_size", "Block size in bytes") 
     216        for i in xrange(self["count"].value): 
     217            yield UInt64(self, "block_location[]", "location in compressed data of 1st block boundary in uncompressed data") 
     218 
     219class SystemEntry(FieldSet): 
     220    ENTRY_TYPE={0:"HHP: [OPTIONS]: Contents File", 
     221                1:"HHP: [OPTIONS]: Index File", 
     222                2:"HHP: [OPTIONS]: Default Topic", 
     223                3:"HHP: [OPTIONS]: Title", 
     224                4:"File Metadata", 
     225                5:"HHP: [OPTIONS]: Default Window", 
     226                6:"HHP: [OPTIONS]: Compiled file", 
     227                # 7 present only in files with Binary Index; unknown function 
     228                # 8 unknown function 
     229                9: "Version", 
     230                10: "Timestamp", 
     231                # 11 only in Binary TOC files 
     232                12: "Number of Info Types", 
     233                13: "#IDXHDR file", 
     234                # 14 unknown function 
     235                # 15 checksum?? 
     236                16:"HHP: [OPTIONS]: Default Font", 
     237    } 
     238    def createFields(self): 
     239        yield Enum(UInt16(self, "type", "Type of entry"),self.ENTRY_TYPE) 
     240        yield UInt16(self, "length", "Length of entry") 
     241        yield RawBytes(self, "data", self["length"].value) 
     242    def createDescription(self): 
     243        return '#SYSTEM Entry, Type %s'%self["type"].display 
     244         
     245class SystemFile(FieldSet): 
     246    def createFields(self): 
     247        yield UInt32(self, "version", "Either 2 or 3") 
     248        while self.current_size < self.size: 
     249            yield SystemEntry(self, "entry[]") 
     250 
    167251class ChmFile(Parser): 
    168252    tags = { 
    169253        "id": "chm", 
     
    176260    endian = LITTLE_ENDIAN 
    177261 
    178262    def validate(self): 
    179         if self.stream.readBytes(0, 4) != "ITSF": 
     263        if self["itsf/magic"].value != "ITSF": 
    180264            return "Invalid magic" 
    181265        if self["itsf/version"].value != 3: 
    182266            return "Invalid version" 
     
    189273        padding = self.seekByte(self["itsf/dir_offset"].value) 
    190274        if padding: 
    191275            yield padding 
    192         yield Directory(self, "dir", size=self["itsf/dir_len"].value*8) 
     276        directory=Directory(self, "dir", size=self["itsf/dir_len"].value*8) 
     277        yield directory 
    193278 
     279        # alternately, this could be a SeekableFieldSet, which would eliminate 
     280        # the need for running through all entries and would produce them in 
     281        # the order that they appear in the index. However, then, it would not 
     282        # be possible to see every byte of the file, as some may be skipped over. 
     283        for field in directory: 
     284            pass # exhaust directory to read all entries in order to sort them 
     285 
     286        entries=sorted(directory.entries,key=lambda x:x["start"].value) 
     287        otherentries={} 
     288        for entry in entries: 
     289            if entry["section"].value != 0: 
     290                otherentries.setdefault(entry["section"].value,[]).append(entry) 
     291                continue 
     292            if entry["length"].value==0: continue 
     293            padding=self.seekByte(self["itsf/data_offset"].value+entry["start"].value) 
     294            if padding: 
     295                yield padding 
     296            name=entry["name"].value 
     297            if name=="::DataSpace/NameList": 
     298                yield NameList(self, "name_list") 
     299            elif name.startswith('::DataSpace/Storage/'): 
     300                sectname=str(name.split('/')[2]) 
     301                if name.endswith('/SpanInfo'): 
     302                    yield UInt64(self, "%s_spaninfo"%sectname, "Size of uncompressed data in the %s section"%sectname) 
     303                elif name.endswith('/ControlData'): 
     304                    yield ControlData(self, "%s_controldata"%sectname, "Data about the compression scheme", size=entry["length"].value*8) 
     305                elif name.endswith('/Transform/List'): 
     306                    yield String(self, "%s_transform_list"%sectname, 38, description="Transform/List element", charset="UTF-16-LE") 
     307                elif name.endswith('/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable'): 
     308                    yield ResetTable(self, "%s_reset_table"%sectname, "LZX Reset Table", size=entry["length"].value*8) 
     309                elif name.endswith('/Content'): 
     310                    # eventually, a LZX wrapper will appear here, we hope! 
     311                    yield RawBytes(self, "%s_content"%sectname, entry["length"].value, "Content for the %s section"%sectname) 
     312                else: 
     313                    yield RawBytes(self, "entry_data[]", entry["length"].value, name) 
     314            elif name=="/#SYSTEM": 
     315                yield SystemFile(self, "system_file", size=entry["length"].value*8) 
     316            else: 
     317                yield RawBytes(self, "entry_data[]", entry["length"].value, name) 
     318 
    194319        size = (self.size - self.current_size) // 8 
    195320        if size: 
    196321            yield RawBytes(self, "raw_end", size) 
    197322 
     323    def getFile(self, filename): 
     324        page=0 
     325        if 'pmgi' in self['/dir']: 
     326            for entry in self['/dir/pmgi'].array('entry'): 
     327                if entry['name'].value <= filename: 
     328                    page=entry['page'].value 
     329        pmgl=self['/dir/pmgl[%i]'%page] 
     330        for entry in pmgl.array('entry'): 
     331            if entry['name'].value == filename: 
     332                return entry 
     333        raise ParserError("File '%s' not found!"%filename) 
     334 
    198335    def createContentSize(self): 
    199336        return self["file_size/file_size"].value * 8 
    200337