Refactoring exercise.
This is a beginner's exercise in refactoring.
Below is posted the text for Amper.
You can get any other code it needs or find out more about what it is
for from http://mindprod.com/products1.html#AMPER
Notice how similar the methods ampifyPossiblyCommentedString and
ampifyPossiblyScriptedString are.
Your task is to refactor out that commonality in handling of text
inside/outside markers so that the logic appears only once.
Hint: see http://mindprod.com/jgloss/callback.html
This is aimed at newbies. If you are an old-timer, see if you could
come up with something clever to make the code more readable and
terse.
You don't have to show a complete working program, just the refactored
logic.
/*
* [Amper.java]
*
* Summary: amper, converts invalid & to &amp; in html.
*
* Copyright: (c) 1999-2012 Roedy Green, Canadian Mind Products,
http://mindprod.com
*
* Licence: This software may be copied and used freely for any
purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.5+
*
* Created with: JetBrains IntelliJ IDEA IDE
http://www.jetbrains.com/idea/
*
* Version History:
* 1.1 2006-03-05
* 1.2 2007-03-26 fix bug in StripEntities. Was not doing &nbsp;
properly.
* 1.3 2007-04-07 recover from crash. Tidy code.
* 1.4 2007-05-10 add icon, PAD file.
* 1.5 2007-06-29 add -q command line support. New CommandLine
interface.
* 1.6 2008-08-03 change detail parameter so that you can request
three levels of detail, rather than two.
* 1.7 2012-01-25 now handles HTML5 entities. It now leaves any
unusual entities as is.
* 1.8 2012-02-09 fix bug. Now handles even very longest HTML5
entities. No longer extends DeEntifyStrings.
* 1.9 2012-06-18 allow you to ampify .htm and .csv files
* 2.0 2012-11-03 deal with scripts: text inside <script> is no longer ampified.
* new methods ampifyPossiblyScriptedString(String)
ampifyPossiblyCommentedString(String)
* deprecated ampifyCommented.
*/
package com.mindprod.amper;
import com.mindprod.commandline.CommandLine;
import com.mindprod.common11.Misc;
import com.mindprod.filter.AllButSVNDirectoriesFilter;
import com.mindprod.filter.ExtensionListFilter;
import com.mindprod.hunkio.HunkIO;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.lang.System.out;
/**
* amper, converts invalid &amp; to &amp;amp; in html.
* <p/>
*
* @author Roedy Green, Canadian Mind Products
* @version 2.0 2012-11-03 deal text inside <script is no longer
ampified.
* new methods ampifyPossiblyScriptedString(String)
ampifyPossiblyCommentedString(String)
* deprecated ampifyCommented.
* @noinspection WeakerAccess
* @since 1999
*/
public final class Amper
{
// ------------------------------ CONSTANTS ------------------------------
/**
* true if want extra debug output
*/
private static final boolean DEBUGGING = false;
/**
* Longest an HTML 5 entity can be, at least in our tables,
including the lead & and trail ;.
*/
static final int LONGEST_HTML5_ENTITY =
"∳".length();
/**
* undisplayed copyright notice.
*
* @noinspection UnusedDeclaration
*/
public static final String EMBEDDED_COPYRIGHT =
"Copyright: (c) 1999-2012 Roedy Green, Canadian Mind
Products, http://mindprod.com";
/**
* date this version released.
*
* @noinspection UnusedDeclaration
*/
private static final String RELEASE_DATE = "2012-06-18";
/**
* how to use the command line
*/
private static final String USAGE = "Amper needs a filename.html
or a space-separated list of filenames, " +
"with optional -s -q -v
switches";
/**
* embedded version string.
*
* @noinspection UnusedDeclaration
*/
public static final String VERSION_STRING = "1.9";
/**
* pattern to detect entity less lead & with trail ; alpha, &#x
hex or &# numeric
*/
private static final Pattern ENTITY_PATTERN = Pattern.compile(
"\\p{Alnum}{2," + ( LONGEST_HTML5_ENTITY - 2 ) +
"};|#x[0-9a-fA-F]{1,8};|#\\p{Digit}{1,10};" );
// -------------------------- PUBLIC STATIC METHODS --------------------------
/**
 * convert all {@code &} except ones in comments to {@code &amp;}.
 * Simply forwards to {@link #ampifyPossiblyCommentedString(String)}.
 *
 * @param big string possibly containing {@code &} and comments, but no {@code <script}s
 *
 * @return tidied string, exactly as returned by ampifyPossiblyCommentedString.
 * @noinspection WeakerAccess
 * @see #ampifyPossiblyScriptedString(String)
 * @see #ampifyPossiblyCommentedString(String)
 * @see #ampifyUncommentedString(String)
 * @deprecated renamed to ampifyPossiblyCommentedString. You probably really want
 *             ampifyPossiblyScriptedString.
 */
@Deprecated
public static String ampifyCommented( String big )
{
    return ampifyPossiblyCommentedString( big );
}
/**
* fix amps in one file.
*
* @param fileBeingProcessed the file currently being processed.
* @param detail 0=out output at all, 1=just files
changed, 2=all files.
*
* @throws IOException if trouble reading or writing file
* @noinspection SameParameterValue, WeakerAccess
* @see #ampifyPossiblyScriptedString(String)
* @see #ampifyPossiblyCommentedString(String)
* @see #ampifyUncommentedString(String)
*/
public static void ampifyFile( File fileBeingProcessed,
int detail ) throws IOException
{
if ( !( fileBeingProcessed.getName().endsWith( ".html" )
|| fileBeingProcessed.getName().endsWith( ".htm" )
|| fileBeingProcessed.getName().endsWith( ".csv" ) ) )
{
out.println( "Cannot amp: "
+ fileBeingProcessed.getName()
+ "not .html .htm .csv file" );
return;
}
String big = HunkIO.readEntireFile( fileBeingProcessed );
String result = ampifyPossiblyScriptedString( big );
if ( result.equals( big ) )
{
// nothing changed. No need to write results.
if ( detail >= 2 )
{
out.println( "- " + fileBeingProcessed.getName() );
}
return;
}
// generate output into a temporary file until we are sure all
is ok.
// create a temp file in the same directory as filename
if ( detail >= 1 )
{
// it changed
out.println( "* " + fileBeingProcessed.getName() );
}
final File tempFile = HunkIO.createTempFile( "temp", ".tmp",
fileBeingProcessed );
FileWriter emit = new FileWriter( tempFile );
emit.write( result );
emit.close();
// successfully created output in same directory as input,
// Now make it replace the input file.
Misc.deleteAndRename( tempFile, fileBeingProcessed );
}
/**
* convert all & except ones in comments to &.
*
* @param big string possibly containing & and comments, but no
<scripts
*
* @return tidied string.
* @noinspection WeakerAccess
* @see #ampifyPossiblyScriptedString(String)
* @see #ampifyUncommentedString(String)
*/
public static String ampifyPossiblyCommentedString( String big )
{
int originalLength = big.length();
final StringBuilder sb = new StringBuilder( originalLength );
// indexes which character we are working on
int i = 0;
while ( i < originalLength )
{
// search for start of comment
int startCommentPlace = big.indexOf( "<!--", i );
if ( startCommentPlace < 0 )
{
// no more comments, finish off this last chunk
sb.append( ampifyUncommentedString( big.substring( i,
originalLength ) ) );
break;
}
// we found the start of a comment
// process html in front of comment, possibly empty
sb.append( ampifyUncommentedString( big.substring( i,
startCommentPlace ) ) );
// find the end of comment
int endCommentPlace =
big.indexOf( "-->", startCommentPlace +
"<!--".length() );
if ( endCommentPlace < 0 )
{
throw new IllegalArgumentException( "missing --> on a
comment" );
}
endCommentPlace += "-->".length();
String commentText = big.substring( startCommentPlace,
endCommentPlace );
// make sure the comments not malformed. Should be no
embedded start
// comment marker
String commentGuts = commentText.substring(
"<!--".length(), commentText.length() - "-->".length() );
if ( commentGuts.contains( "<!--" ) )
{
throw new IllegalArgumentException( "<!-- ... --> not
balanced" );
}
// output the comment unchanged
sb.append( commentText );
i = endCommentPlace;
}// end while
return sb.toString();
}
/**
* convert all & except ones in comments or inside <script to
&.
*
* @param big string possibly containing & and comments and
<scripts
*
* @return tidied string.
* @noinspection WeakerAccess
* @see #ampifyPossiblyCommentedString(String)
* @see #ampifyUncommentedString(String)
*/
public static String ampifyPossiblyScriptedString( String big )
{
int originalLength = big.length();
final StringBuilder sb = new StringBuilder( originalLength );
// indexes which character we are working on
int i = 0;
while ( i < originalLength )
{
// search for start of <script
int startScriptPlace = big.indexOf( "<script", i );
if ( startScriptPlace < 0 )
{
// no more scripts, finish off this last chunk
sb.append( ampifyPossiblyCommentedString(
big.substring( i,
originalLength ) ) );
break;
}
// we found the start of a <script
// process html in front of <script, possibly empty
sb.append( ampifyPossiblyCommentedString( big.substring(
i,
startScriptPlace ) ) );
// find the end of script
int endScriptPlace =
big.indexOf( "</script>", startScriptPlace +
"</script>".length() );
if ( endScriptPlace < 0 )
{
throw new IllegalArgumentException( "missing
</script>" );
}
endScriptPlace += "</script>".length();
String scriptText =
big.substring( startScriptPlace, endScriptPlace );
// make sure the <scripts not malformed. Should be no
embedded start marker
String scriptGuts = scriptText.substring(
"script".length(), scriptText.length() - "</script>".length() );
if ( scriptGuts.contains( "script" ) )
{
throw new IllegalArgumentException( "<script ...
</script> not balanced" );
}
// output the script unchanged
sb.append( scriptText );
i = endScriptPlace;
}// end while
return sb.toString();
}
/**
* convert all & to & unless it has been done already. Leaves
existing
* entities as is.
*
* @param chunk the string to process
*
* @return tidied string
* @noinspection WeakerAccess
* * @see #ampifyPossiblyScriptedString(String)
* @see #ampifyPossiblyCommentedString(String)
*/
public static String ampifyUncommentedString( String chunk )
{
// do a quick check. If chunk contains no &, we have nothing
to do,
// guaranteed
if ( !chunk.contains( "&" ) )
{
return chunk;
}
int length = chunk.length();
final StringBuilder sb2 = new StringBuilder( length + 20 );
int i = 0;
while ( i < length )
{
int ampPlace = chunk.indexOf( "&", i );
if ( ampPlace < 0 )
{
// all done, copy over the remaining chunk.
sb2.append( chunk.substring( i, length ) );
// don't need to increment i
break;
}
// we found an &
// copy over stuff before the & we just found
sb2.append( chunk.substring( i, ampPlace ) );
i = ampPlace;
// is it an & or < or some other entity already?
// get string without lead & but with trailing ; if it
exists.
final String candidate = chunk.substring( i + 1, Math.min(
i + LONGEST_HTML5_ENTITY, length ) );
final Matcher m = ENTITY_PATTERN.matcher( candidate );
// quick test. Just check pattern starting just after &
if ( m.lookingAt() )
{
// this was an entity already, leave it alone.
sb2.append( '&' );
}
else
{
// convert & to &
sb2.append( "&" );
}
i++;
}// end while
return sb2.toString();
}
// --------------------------- CONSTRUCTORS ---------------------------
/**
 * Private and never called: every member of this class is static, so no instance is needed.
 *
 * @noinspection WeakerAccess
 */
private Amper() { }
// --------------------------- main() method ---------------------------
/**
* fixes ampersands in HTML files.
*
* @param args names of files to process, dirs, files, -s, *.*, no
wildcards.
*/
public static void main( String[] args )
{
if ( DEBUGGING )
{
out.println( ENTITY_PATTERN.toString() );
}
// gather all the files mentioned on the command line.
// either directories, files, with -s and subdirs option.
// warning. Windows expands any wildcards in a nasty way.
// do not use wildcards.
// See http://mindprod.com/jgloss/wildcard.html
out.println( "Gathering html files to &ify..." );
CommandLine commandLine = new CommandLine( args,
new AllButSVNDirectoriesFilter(),
new ExtensionListFilter( "html", "htm", "csv" ) );
if ( commandLine.size() == 0 )
{
throw new IllegalArgumentException( "No files found to
process\n" + USAGE );
}
final boolean quiet = commandLine.isQuiet();
for ( File file : commandLine )
{
try
{
// -q gives no output at all, otherwise just files
that changed.
ampifyFile( file, quiet ? 0 : 1 );
}
catch ( FileNotFoundException e )
{
out.println( "Error: "
+ file.getAbsolutePath()
+ " not found." );
}
catch ( Exception e )
{
out.println( e.getMessage()
+ " in file "
+ file.getAbsolutePath() );
}
}// end for
}// end main
}
--
Roedy Green Canadian Mind Products http://mindprod.com
Ironically, even though the Internet was created by the US military
[DARPA (Defense Advanced Research Projects Agency)]
to withstand a nuclear attack, it is almost defenceless against malice
from any of its users