/**
 * HTML Tag Checker
 * @author Robert J Morton
 * @version 10 July 2018
 * @copyright Sep 2018 Robert J Morton (all rights reserved)
 */

/* This program checks the integrity of tag-pairs in HTML files.

   It scans every HTML file in the website directory tree, capturing all
   tags of the types that must occur in pairs, e.g. <b>...</b>. For each
   file in which the number of start-tags of some type differs from the
   number of its end-tags, the full path of the file and the list of the
   unbalanced tag types are written to the file "badtags.txt" in the
   current directory. The path of every file that contains a <table>
   start-tag is written to "table_tags.txt".

   The tags actually tested are:

     <blockquote>, <address>, <strong>, <lists>, <small>, <table>, <title>,
     <body>, <cite>, <code>, <font>, <head>, <html>, <nobr>, <samp>, <big>,
     <div>, <pre>, <sub>, <sup>, <em>, <h1>, <h2>, <h3>, <h4>, <tt>, <b>,
     <i>, <u>

   Others can be added to the string array at the top of the class. It is
   then necessary to recompile the source for the new tags to be included
   in scans.

   Known limitations: only the net count of each tag type is checked, so
   correctly counted but wrongly ordered tags are not detected; and any
   text between '<' and '>' is treated as a tag, including text inside
   HTML comments and scripts. */

import java.io.*;
import java.util.Locale;

class tag_checker {

  private static Writer
    bt,   // output text file listing files with unpaired tags
    tb;   // output text file listing files containing a <table> tag

  /* The 29 HTML tags that must have corresponding end-tags. Captured tag
     names are matched against these by exact equality, so the order of
     this array only determines the order in which unpaired tags are
     listed in the report. */
  private static final String Tags[] = {
    "blockquote","address","strong","lists","small","table","title",
    "body","cite","code","font","head","html","nobr","samp","big",
    "div","pre","sub","sup","em","h1","h2","h3","h4","tt","b","i","u"
  };

  /* Directories whose full path ends with one of these strings are not
     descended into (development directories). Note that this is a suffix
     test on the path, not an exact match on the directory name. */
  private static final String SkippedDirs[] = {
    "webtools","images","applets","java_progs","C-programs"
  };

  // Set by tagCapture() when the file just examined contains a <table>.
  private static boolean TABLE = false;

  // Full path to the HTML file being examined for tag-pair errors.
  private static String fp;


  /**
   * Examines the HTML file whose path is in {@code fp} and reports the tag
   * types whose start-tags and end-tags do not balance.
   *
   * Every character sequence found between a '<' and the next '>' is
   * treated as a tag. A counter per tag type is incremented for each
   * start-tag and decremented for each end-tag; any counter that is not
   * zero at the end of the file marks an unpaired tag type. All parsing
   * state is local to the call, so an unterminated '<' at the end of one
   * file cannot affect the next file. As a side effect the TABLE flag is
   * set if the file contains a table start-tag.
   *
   * If the file cannot be opened or read, a warning is printed to
   * System.err and whatever was tallied up to that point is reported.
   *
   * @return the file path, a newline, the comma-separated list of unpaired
   *         tags and two newlines; or the empty string if all tags balance
   */
  static String tagCapture() {
    int nest[] = new int[Tags.length];        // per-tag nesting counters
    boolean inTag = false;                    // between a '<' and a '>'
    StringBuilder raw = new StringBuilder();  // text captured inside a tag

    TABLE = false;

    try (Reader in = new BufferedReader(new FileReader(fp))) {
      int c;
      while ((c = in.read()) != -1) {
        if (c == '<') {             // a start-bracket always begins a new
          inTag = true;             // capture, discarding any partial one
          raw.setLength(0);
        } else if (inTag) {
          if (c == '>') {
            inTag = false;
            tally(raw.toString(), nest);
          } else {
            raw.append((char) c);
          }
        }
      }
    } catch (IOException e) {
      System.err.println("tag_checker: cannot read " + fp + ": " + e.getMessage());
    }

    // List every tag type whose counter did not return to zero.
    StringBuilder bad = new StringBuilder();
    for (int i = 0; i < Tags.length; i++) {
      if (nest[i] != 0) {
        if (bad.length() > 0) {
          bad.append(", ");
        }
        bad.append('<').append(Tags[i]).append('>');
      }
    }

    if (bad.length() == 0) {
      return "";
    }
    return fp + "\n" + bad + "\n\n";
  }


  /**
   * Updates the nesting counters for one captured tag.
   *
   * @param raw  the text found between '<' and '>', attributes included
   * @param nest the per-tag counters, indexed like {@code Tags}
   */
  private static void tally(String raw, int nest[]) {
    // Keep only the tag name: everything before the first whitespace
    // character (attributes may follow a space, a tab or a line break).
    int end = 0;
    while (end < raw.length() && !Character.isWhitespace(raw.charAt(end))) {
      end++;
    }

    // HTML tag names are case-insensitive, so compare in lower case.
    String name = raw.substring(0, end).toLowerCase(Locale.ROOT);

    int sense = 1;                  // +1 = start tag; -1 = end tag
    if (name.startsWith("/")) {
      name = name.substring(1);
      sense = -1;
    } else if (name.equals("table")) {
      TABLE = true;
    }

    for (int i = 0; i < Tags.length; i++) {
      if (name.equals(Tags[i])) {
        nest[i] += sense;
        break;
      }
    }
  }


  /** Tells whether the directory at the given path must not be scanned. */
  private static boolean isSkippedDirectory(String path) {
    for (int i = 0; i < SkippedDirs.length; i++) {
      if (path.endsWith(SkippedDirs[i])) {
        return true;
      }
    }
    return false;
  }


  /**
   * Examines the entries of directory {@code d}. Each file whose path ends
   * with ".html" is checked with tagCapture() and the results are written
   * to the two output files. Each sub-directory that is not a skipped
   * development directory is scanned by a recursive call, so any depth of
   * sub-directories is handled. Called from main() and from itself.
   *
   * @param d path of the directory to scan
   * @throws IOException if writing to an output file fails
   */
  private static void scan(String d) throws IOException {
    String entries[] = new File(d).list();

    // list() returns null if d is not a directory or cannot be read.
    if (entries == null) {
      System.err.println("tag_checker: cannot list directory " + d);
      return;
    }

    for (int i = 0; i < entries.length; i++) {
      String path = d + "/" + entries[i];
      File f = new File(path);

      if (f.isDirectory()) {
        if (!isSkippedDirectory(path)) {
          scan(path);
        }
      } else if (f.isFile() && path.endsWith(".html")) {
        fp = path;                  // tagCapture() reads its input from fp
        String report = tagCapture();
        if (!report.equals("")) {
          bt.write(report);
        }
        if (TABLE) {
          TABLE = false;
          tb.write(path + '\n');
        }
      }
    }
  }


  /**
   * Scans a directory tree and writes "badtags.txt" and "table_tags.txt"
   * in the current directory.
   *
   * @param args optionally the document base directory followed by the
   *             sub-directory to scan; if fewer than two arguments are
   *             given, "/home/rob/Private" and "website" are used
   * @throws Exception if an output file cannot be created or written
   */
  public static void main(String args[]) throws Exception {
    String bd = "/home/rob/Private";    // document base directory
    String sd = "website";              // sub-directory to be scanned

    if (args.length >= 2) {
      bd = args[0];
      sd = args[1];
    }

    String d = bd + "/" + sd;

    if (!new File(d).isDirectory()) {
      System.out.println(d + " is not a directory.");
      return;
    }

    // try-with-resources closes both writers even if the scan fails.
    try (Writer bad = new BufferedWriter(new FileWriter("badtags.txt"));
         Writer tables = new BufferedWriter(new FileWriter("table_tags.txt"))) {
      bt = bad;
      tb = tables;
      scan(d);    // scan for HTML files in the specified directory tree
    }
  }
}