Microsoft Word Autocorrects

From:

Roedy Green <see_website@mindprod.com.invalid>

Newsgroups:

comp.lang.java.programmer

Date:

Tue, 02 Mar 2010 03:39:24 -0800

Message-ID:

<g1upo5phf1djl0rq5dg2n8kmfoem45jqa8@4ax.com>

I have been attempting to decode a file used by Microsoft Word, one
that stores the autocorrects, possibly to create some utilities to
manage them, export, import etc.

I wrote this little program to dump out the contents of an autocorrect
file to test my understanding of the format. It had a number of
unexpected wrinkles after I thought I had it nailed.

There are still a few puzzles left. I thought some people here might
enjoy the challenge of solving them. If not, just ignore this post.

/*
* @(#)Autocorrects.java
*
* Summary: read MS Word autocorrects file.
*
* Copyright: (c) 2010 Roedy Green, Canadian Mind Products,
* http://mindprod.com
*
* Licence: This software may be copied and used freely for any
* purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.6+
*
* Created with: IntelliJ IDEA IDE.
*
* Version History:
* 1.0 2010-03-01
*/
package com.mindprod.example;

import com.mindprod.common11.StringTools;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;

import static java.lang.System.out;

/**
* read MS Word autocorrects file
* <p/>
* Outstanding puzzles:
* 1. what is the significance of the 0x2000 and 0x2100 spacers?
* 2. what are the bits in the mystery region used for?
* 3. How is the language encoded if it indeed is?
* 4. Precisely how do all the options map onto bits?
*
* @author Roedy Green, Canadian Mind Products
* @version 1.0 2006-03-24
* @noinspection WeakerAccess
* @since 2006-03-24
*/

public class Autocorrects
    {
    // ------------------------------ CONSTANTS
------------------------------

    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String BRITISH_AUTOCORRECTS =
"C:/Users/Roedy/AppData/roaming/Microsoft/Office/MSO2057.acl";

    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String AMERICAN_AUTOCORRECTS =
"C:/Users/Roedy/AppData/roaming/Microsoft/Office/MSO1033.acl";

    @SuppressWarnings( { "UnusedDeclaration" } )
    private static final String FRENCH_AUTOCORRECTS = "F:\\Program
Files (x86)\\Microsoft Office\\Office\\1036\\mso.acl";

    /**
     * table to convert a nibble to a hex char.
     */
    static char[] hexChar = {
            '0', '1', '2', '3',
            '4', '5', '6', '7',
            '8', '9', 'a', 'b',
            'c', 'd', 'e', 'f' };
    // -------------------------- PUBLIC STATIC METHODS
--------------------------

    /**
     * Fast convert a byte array to a hex string
     * with possible leading zero.
     *
     * @param bs array of bytes to convert to string
     *
     * @return hex representation, two chars per byte.
     */
    public static String toHexString( byte[] bs )
        {
        StringBuffer sb = new StringBuffer( bs.length * 2 );
        for ( byte b : bs )
            {
            // look up high nibble char
            sb.append( hexChar[ ( b & 0xf0 ) >>> 4 ] );

            // look up low nibble char
            sb.append( hexChar[ b & 0x0f ] );
            }
        return sb.toString();
        }

    // --------------------------- main() method
---------------------------

    public static void main( String[] args ) throws IOException
        {
        // O P E N
        final FileInputStream fis = new FileInputStream(
FRENCH_AUTOCORRECTS );
        //
        final BufferedInputStream bis = new BufferedInputStream( fis,
65536 /* 64K bytes */ );
        final DataInputStream dis = new DataInputStream( bis );

        int offset = 0;
        // 4 bytes : signature 04 01 96 00
        byte[] signature = new byte[4];
         if ( dis.read( signature ) != 4 )
             {
             throw new IllegalArgumentException("trouble reading
signature bytes");
             }
        out.println( "signature: " + toHexString( signature ) );
        offset += 4;

        // 4 bytes : option e.g. 22 c0 ef 05
        byte[] options = new byte[4];
         if ( dis.read( options ) != 4 )
             {
             throw new IllegalArgumentException("trouble reading
option bytes");
             }
        out.println( "options: " + toHexString( options ) );
        offset += 4;

        // 4 bytes: little endian length of file in bytes
        int fileLength = Integer.reverseBytes( dis.readInt() );
        out.println( "file length: " + StringTools.toLZHexString(
fileLength, 4 ) );
        offset += 4;

        // 5 bytes mystery e.g 93 03 00 00 b9 or 9f 03 00 00 53
        byte[] mystery = new byte[5];
         if ( dis.read( mystery ) != 5 )
             {
             throw new IllegalArgumentException("trouble reading
mystery bytes");
             }
        out.println( "mystery: " + toHexString( mystery ) );
        offset += 5;

        int count = 0;

        try
            {
            while ( true )
                {
                out.print( StringTools.toLZHexString( offset, 4 ) );

                // bypass possibly multiple spacer 0s. usually 1.
                // Why bother with spacers? probably originally to
make the file more acceptable to C++
                // which likes 0-terminators on its strings.
                // Why variable numbers of them? Probably just to
mess with the minds of
                // people attempting to export the data.
                int lena = 0;
                while ( lena == 0 || lena == 0x2000 || lena ==
0x2100)
                    {
                    // read length or spacer
                    lena = dis.readShort();
                    offset += 2;
                    }
                // read abbreviation UTF-16BE
                if ( !( 1 <= lena && lena <= 1024 ) )
                    {
                    throw new IllegalArgumentException( "corrupt file
abbreviation length: " + lena );
                    }
                final char[] abbr = new char[lena];
                for ( int i = 0; i < lena; i++ )
                    {
                    abbr[ i ] = dis.readChar();
                    }
                offset += lena * 2;

                // bypass possibly multiple spacer 0s. Usually 1.
                int lene = 0;
                while ( lene == 0 || lene == 0x2000 || lene == 0x2100
)
                    {
                    // read length or spacer
                    lene = dis.readShort();
                    offset += 2;
                    }

                // read expansion UTF-16BE
                if ( !( 1 <= lene && lene <= 1024 ) )
                    {
                    throw new IllegalArgumentException( "corrupt file
expansion length: " + lene );
                    }

               final char[] expansion = new char[lene];
                for ( int i = 0; i < lene; i++ )
                    {
                    expansion[ i ] = dis.readChar();
                    }

                offset += lene * 2;
                count++;
                out.println( " " + count + " " + new String( abbr ) +
" --> " + new String( expansion ) );
                }
            }
        catch ( EOFException e )
            {
            dis.close();
            }
        }
    }
--
Roedy Green Canadian Mind Products
http://mindprod.com

The major difference between a thing that might go wrong and a thing that cannot possibly go wrong is that when a thing that cannot possibly go wrong goes wrong it usually turns out to be impossible to get at or repair.
~ Douglas Adams (born: 1952-03-11 died: 2001-05-11 at age: 49)