Refactoring exercise.

Roedy Green <>
Sat, 03 Nov 2012 01:37:20 -0700
This is a beginner's exercise in refactoring.

Below is posted the text for Amper.
You can get any other code it needs or find out more about what it is
for from

Notice how similar the methods ampifyPossiblyCommentedString and
ampifyPossiblyScriptedString are.

Your task is to refactor out that commonality in handling of text
inside/outside markers so that the logic appears only once.

Hint: see

This is aimed at newbies. If you are an old-timer, see if you could
come up with something clever to make the code more readable.

You don't have to show a complete working program, just the refactored code.

 * []
 * Summary: amper, converts invalid & to &amp; in html.
 * Copyright: (c) 1999-2012 Roedy Green, Canadian Mind Products,
 * Licence: This software may be copied and used freely for any
purpose but military.
 * Requires: JDK 1.5+
 * Created with: JetBrains IntelliJ IDEA IDE
 * Version History:
 * 1.1 2006-03-05
 * 1.2 2007-03-26 fix bug in StripEntities. Was not doing &#xffff;
 * 1.3 2007-04-07 recover from crash. Tidy code.
 * 1.4 2007-05-10 add icon, PAD file.
 * 1.5 2007-06-29 add -q command line support. New CommandLine
 * 1.6 2008-08-03 change detail parameter so that you can request
three levels of detail, rather than two.
 * 1.7 2012-01-25 now handles HTML5 entities. It now leaves any
unusual entities as is.
 * 1.8 2012-02-09 fix bug. Now handles even very longest HTML5
entities. No longer extends DeEntifyStrings.
 * 1.9 2012-06-18 allow you to ampify .htm and .csv files
 * 2.0 2012-11-03 text inside <script> is no longer ampified.
 * new methods ampifyPossiblyScriptedString(String)
 * deprecated ampifyCommented.
package com.mindprod.amper;

import com.mindprod.commandline.CommandLine;
import com.mindprod.common11.Misc;
import com.mindprod.filter.AllButSVNDirectoriesFilter;
import com.mindprod.filter.ExtensionListFilter;
import com.mindprod.hunkio.HunkIO;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.System.out;

 * amper, converts invalid & to &amp; in html.
 * <p/>
 * @author Roedy Green, Canadian Mind Products
 * @version 2.0 2012-11-03 text inside <script> is no longer ampified.
 * new methods ampifyPossiblyScriptedString(String)
 * deprecated ampifyCommented.
 * @noinspection WeakerAccess
 * @since 1999
public final class Amper
    // ------------------------------ CONSTANTS

     * true if want extra debug output
    // When true, main() prints ENTITY_PATTERN before doing any work.
    private static final boolean DEBUGGING = false;

     * Longest an HTML 5 entity can be, at least in our tables,
including the lead & and trail ;.
    // NOTE(review): the initializer of LONGEST_HTML5_ENTITY is missing from this excerpt;
    // restore it from the original source before compiling.
    static final int LONGEST_HTML5_ENTITY =

     * undisplayed copyright notice.
     * @noinspection UnusedDeclaration
    // NOTE(review): the string literal below is cut off in this excerpt.
    public static final String EMBEDDED_COPYRIGHT =
            "Copyright: (c) 1999-2012 Roedy Green, Canadian Mind

     * date this version released.
     * @noinspection UnusedDeclaration
    // NOTE(review): the version history lists 2.0 dated 2012-11-03, but this constant and
    // VERSION_STRING still say 2012-06-18 / 1.9 -- confirm whether they should be bumped.
    private static final String RELEASE_DATE = "2012-06-18";

     * how to use the command line
    // Appended to the "No files found to process" error raised by main().
    // NOTE(review): the tail of this string literal is cut off in this excerpt.
    private static final String USAGE = "Amper needs a filename.html
or a space-separated list of filenames, " +
                                        "with optional -s -q -v

     * embedded version string.
     * @noinspection UnusedDeclaration
    public static final String VERSION_STRING = "1.9";

     * pattern to detect entity less lead & with trail ; alpha, &#x
hex or &# numeric
    // Applied to the text that follows an &. Three alternatives, each ending in ';':
    //   2 .. (LONGEST_HTML5_ENTITY - 2) alphanumerics   (named entity)
    //   "#x" followed by 1-8 hex digits                 (hex numeric entity)
    //   "#" followed by 1-10 decimal digits             (decimal numeric entity)
    private static final Pattern ENTITY_PATTERN = Pattern.compile(
"\\p{Alnum}{2," + ( LONGEST_HTML5_ENTITY - 2 ) +
 "};|#x[0-9a-fA-F]{1,8};|#\\p{Digit}{1,10};" );

    // -------------------------- PUBLIC STATIC METHODS

     * convert all & except ones in comments to &amp;.
     * @param big string possibly containing & and comments, but no
     * @return compacted string.
     * @noinspection WeakerAccess
     * @see #ampifyPossiblyScriptedString(String)
     * @see #ampifyPossiblyCommentedString(String)
     * @see #ampifyUncommentedString(String)
     * @deprecated renamed to ampifyPossiblyCommentedString . You
probably really want
     * ampifyPossiblyScriptedString.
    public static String ampifyCommented( String big )
        return ampifyPossiblyCommentedString( big );

     * fix amps in one file.
     * @param fileBeingProcessed the file currently being processed.
     * @param detail 0=no output at all, 1=just files changed, 2=all files.
     * @throws IOException if trouble reading or writing file
     * @noinspection SameParameterValue, WeakerAccess
     * @see #ampifyPossiblyScriptedString(String)
     * @see #ampifyPossiblyCommentedString(String)
     * @see #ampifyUncommentedString(String)
    public static void ampifyFile( File fileBeingProcessed,
                                   int detail ) throws IOException
        if ( !( fileBeingProcessed.getName().endsWith( ".html" )
                || fileBeingProcessed.getName().endsWith( ".htm" )
                || fileBeingProcessed.getName().endsWith( ".csv" ) ) )
            out.println( "Cannot amp: "
                         + fileBeingProcessed.getName()
                         + "not .html .htm .csv file" );
        String big = HunkIO.readEntireFile( fileBeingProcessed );
        String result = ampifyPossiblyScriptedString( big );
        if ( result.equals( big ) )
            // nothing changed. No need to write results.
            if ( detail >= 2 )
                out.println( "- " + fileBeingProcessed.getName() );
        // generate output into a temporary file until we are sure all
is ok.
        // create a temp file in the same directory as filename
        if ( detail >= 1 )
            // it changed
            out.println( "* " + fileBeingProcessed.getName() );
        final File tempFile = HunkIO.createTempFile( "temp", ".tmp",
fileBeingProcessed );
        FileWriter emit = new FileWriter( tempFile );
        emit.write( result );
        // successfully created output in same directory as input,
        // Now make it replace the input file.
        Misc.deleteAndRename( tempFile, fileBeingProcessed );

     * convert all & except ones in comments to &amp;.
     * @param big string possibly containing & and comments, but no
     * @return tidied string.
     * @noinspection WeakerAccess
     * @see #ampifyPossiblyScriptedString(String)
     * @see #ampifyUncommentedString(String)
    public static String ampifyPossiblyCommentedString( String big )
        int originalLength = big.length();
        final StringBuilder sb = new StringBuilder( originalLength );
        // indexes which character we are working on
        int i = 0;
        while ( i < originalLength )
            // search for start of comment
            int startCommentPlace = big.indexOf( "<!--", i );
            if ( startCommentPlace < 0 )
                // no more comments, finish off this last chunk
                sb.append( ampifyUncommentedString( big.substring( i,
                        originalLength ) ) );
            // we found the start of a comment
            // process html in front of comment, possibly empty
            sb.append( ampifyUncommentedString( big.substring( i,
                    startCommentPlace ) ) );
            // find the end of comment
            int endCommentPlace =
                    big.indexOf( "-->", startCommentPlace +
"<!--".length() );
            if ( endCommentPlace < 0 )
                throw new IllegalArgumentException( "missing --> on a
comment" );
            endCommentPlace += "-->".length();
            String commentText = big.substring( startCommentPlace,
endCommentPlace );
            // make sure the comments not malformed. Should be no
embedded start
            // comment marker
            String commentGuts = commentText.substring(
"<!--".length(), commentText.length() - "-->".length() );
            if ( commentGuts.contains( "<!--" ) )
                throw new IllegalArgumentException( "<!-- ... --> not
balanced" );
            // output the comment unchanged
            sb.append( commentText );
            i = endCommentPlace;
            }// end while
        return sb.toString();

     * convert all & except ones in comments or inside <script> to &amp;.
     * @param big string possibly containing & and comments and scripts.
     * @return tidied string.
     * @noinspection WeakerAccess
     * @see #ampifyPossiblyCommentedString(String)
     * @see #ampifyUncommentedString(String)
    public static String ampifyPossiblyScriptedString( String big )
        int originalLength = big.length();
        final StringBuilder sb = new StringBuilder( originalLength );
        // indexes which character we are working on
        int i = 0;
        while ( i < originalLength )
            // search for start of <script
            int startScriptPlace = big.indexOf( "<script", i );
            if ( startScriptPlace < 0 )
                // no more scripts, finish off this last chunk
                sb.append( ampifyPossiblyCommentedString(
big.substring( i,
                        originalLength ) ) );
            // we found the start of a <script
            // process html in front of <script, possibly empty
            sb.append( ampifyPossiblyCommentedString( big.substring(
                    startScriptPlace ) ) );
            // find the end of script
            int endScriptPlace =
                    big.indexOf( "</script>", startScriptPlace +
"</script>".length() );
            if ( endScriptPlace < 0 )
                throw new IllegalArgumentException( "missing
</script>" );
            endScriptPlace += "</script>".length();
            String scriptText =
                    big.substring( startScriptPlace, endScriptPlace );
            // make sure the <scripts not malformed. Should be no
embedded start marker
            String scriptGuts = scriptText.substring(
"script".length(), scriptText.length() - "</script>".length() );
            if ( scriptGuts.contains( "script" ) )
                throw new IllegalArgumentException( "<script ...
</script> not balanced" );
            // output the script unchanged
            sb.append( scriptText );
            i = endScriptPlace;
            }// end while
        return sb.toString();

     * convert all & to &amp; unless it has been done already. Leaves
     * entities as is.
     * @param chunk the string to process
     * @return tidied string
     * @noinspection WeakerAccess
     * @see #ampifyPossiblyScriptedString(String)
     * @see #ampifyPossiblyCommentedString(String)
    public static String ampifyUncommentedString( String chunk )
        // do a quick check. If chunk contains no &, we have nothing
to do,
        // guaranteed
        if ( !chunk.contains( "&" ) )
            return chunk;
        int length = chunk.length();
        final StringBuilder sb2 = new StringBuilder( length + 20 );
        int i = 0;
        while ( i < length )
            int ampPlace = chunk.indexOf( "&", i );
            if ( ampPlace < 0 )
                // all done, copy over the remaining chunk.
                sb2.append( chunk.substring( i, length ) );
                // don't need to increment i
            // we found an &
            // copy over stuff before the & we just found
            sb2.append( chunk.substring( i, ampPlace ) );
            i = ampPlace;
            // is it an &amp; or &lt; or some other entity already?

            // get string without lead & but with trailing ; if it
            final String candidate = chunk.substring( i + 1, Math.min(
i + LONGEST_HTML5_ENTITY, length ) );

            final Matcher m = ENTITY_PATTERN.matcher( candidate );
            // quick test. Just check pattern starting just after &
            if ( m.lookingAt() )
                // this was an entity already, leave it alone.
                sb2.append( '&' );
                // convert & to &amp;
                sb2.append( "&amp;" );
            }// end while
        return sb2.toString();

    // --------------------------- CONSTRUCTORS

     * constructor, not used.
     * @noinspection WeakerAccess
    private Amper()

    // --------------------------- main() method

     * fixes ampersands in HTML files.
     * @param args names of files to process, dirs, files, -s, *.*, no
    public static void main( String[] args )
        if ( DEBUGGING )
            out.println( ENTITY_PATTERN.toString() );
        // gather all the files mentioned on the command line.
        // either directories, files, with -s and subdirs option.
        // warning. Windows expands any wildcards in a nasty way.
        // do not use wildcards.
        // See
        out.println( "Gathering html files to &ampify..." );
        CommandLine commandLine = new CommandLine( args,
                new AllButSVNDirectoriesFilter(),
                new ExtensionListFilter( "html", "htm", "csv" ) );
        if ( commandLine.size() == 0 )
            throw new IllegalArgumentException( "No files found to
process\n" + USAGE );
        final boolean quiet = commandLine.isQuiet();
        for ( File file : commandLine )
                // -q gives no output at all, otherwise just files
that changed.
                ampifyFile( file, quiet ? 0 : 1 );
            catch ( FileNotFoundException e )
                out.println( "Error: "
                             + file.getAbsolutePath()
                             + " not found." );
            catch ( Exception e )
                out.println( e.getMessage()
                             + " in file "
                             + file.getAbsolutePath() );
            }// end for
        }// end main

