/**
 * Web Site Titles and Descriptions Length Checker for HTML files.
 * @author Robert John Morton
 * @version 22 May 2019
 */

/* This program lists all HTML files with:
   1) titles shorter than 60 characters in title_too_short.txt
   2) titles longer than 70 characters in title_too_long.txt
   3) descriptions shorter than 140 characters in descr_too_short.txt
   4) descriptions longer than 160 characters in descr_too_long.txt
*/

import java.io.*;

/**
 * Scans an HTML index file character by character, captures the content of
 * every {@code <dt>} (title) and {@code <dd>} (description) element, and
 * writes one report line per entry whose title or description length falls
 * outside the accepted range.
 *
 * <p>Each report line has the form {@code path + " " + length + "\n"}, where
 * {@code path} is the text captured from the most recent title entry.
 *
 * <p>Tag names are matched exactly and case-sensitively: only the tags
 * {@code dt}, {@code dd}, {@code /dt} and {@code /dd} (no attributes) are
 * recognised.
 */
class titles_checker {

    /** Default location of the index file, used when no argument is given. */
    private static final String DEFAULT_INDEX_DIR = "../../articles-index/";
    private static final String DEFAULT_INDEX_FILE = "articles_index.html";

    /** Names of the four report files. */
    private static final String TITLE_TOO_LONG_FILE = "title_too_long.txt";
    private static final String TITLE_TOO_SHORT_FILE = "title_too_short.txt";
    private static final String DESCR_TOO_LONG_FILE = "descr_too_long.txt";
    private static final String DESCR_TOO_SHORT_FILE = "descr_too_short.txt";

    /** Accepted (inclusive) length range for a title. */
    private static final int TITLE_MIN_LENGTH = 60;
    private static final int TITLE_MAX_LENGTH = 70;

    /** Accepted (inclusive) length range for a description. */
    private static final int DESCR_MIN_LENGTH = 140;
    private static final int DESCR_MAX_LENGTH = 160;

    /*
     * A captured <dt> entry starts with the '>' that closes "<dt>" and ends
     * with "</dt" (see scan()). The first 13 and last 8 characters of the
     * trimmed capture are discarded as surrounding markup, and captures of
     * 22 characters or fewer are ignored altogether.
     * NOTE(review): these offsets are tied to the exact markup used in the
     * index file, which is not visible here -- confirm before changing.
     */
    private static final int DT_MARKUP_PREFIX = 13;
    private static final int DT_MARKUP_SUFFIX = 8;
    private static final int DT_MIN_CAPTURE = 22;

    /*
     * Same idea for a captured <dd> entry: 20 leading and 10 trailing
     * characters are discarded, and captures of 34 characters or fewer are
     * ignored.
     */
    private static final int DD_MARKUP_PREFIX = 20;
    private static final int DD_MARKUP_SUFFIX = 10;
    private static final int DD_MIN_CAPTURE = 34;

    /**
     * Writes a report line for {@code path} to {@code tooShort} or
     * {@code tooLong} when {@code length} lies outside {@code [min, max]};
     * writes nothing when it lies inside.
     */
    private static void reportIfOutOfRange(String path, int length, int min, int max,
            Writer tooShort, Writer tooLong) throws IOException {
        String line = path + " " + length + "\n";
        if (length < min) {
            tooShort.write(line);
        } else if (length > max) {
            tooLong.write(line);
        }
    }

    /**
     * Examines one captured title entry and reports it if its title text is
     * too short or too long.
     *
     * @param captured    raw text captured for the {@code <dt>} element
     * @param currentPath path remembered from the previous valid title entry
     * @return the path extracted from this entry, or {@code currentPath}
     *         unchanged when the entry is too short or contains no '>'
     */
    private static String checkTitle(String captured, String currentPath,
            Writer tooShort, Writer tooLong) throws IOException {
        String entry = captured.trim();
        int capturedLength = entry.length();
        if (capturedLength <= DT_MIN_CAPTURE) {
            return currentPath;
        }

        // Strip the surrounding markup, leaving the link target and title
        // text separated by the '>' that closes the opening link tag.
        entry = entry.substring(DT_MARKUP_PREFIX, capturedLength - DT_MARKUP_SUFFIX).trim();
        int separator = entry.indexOf('>');
        if (separator == -1) {
            return currentPath;
        }

        String path = entry.substring(0, separator).trim();
        String title = entry.substring(separator + 1).trim();
        reportIfOutOfRange(path, title.length(), TITLE_MIN_LENGTH, TITLE_MAX_LENGTH,
                tooShort, tooLong);
        return path;
    }

    /**
     * Examines one captured description entry and reports it, under
     * {@code path}, if the description text is too short or too long.
     *
     * @param captured raw text captured for the {@code <dd>} element
     * @param path     path remembered from the most recent valid title entry
     */
    private static void checkDescription(String captured, String path,
            Writer tooShort, Writer tooLong) throws IOException {
        String entry = captured.trim();
        int capturedLength = entry.length();
        if (capturedLength <= DD_MIN_CAPTURE) {
            return;
        }

        String description =
                entry.substring(DD_MARKUP_PREFIX, capturedLength - DD_MARKUP_SUFFIX).trim();
        reportIfOutOfRange(path, description.length(), DESCR_MIN_LENGTH, DESCR_MAX_LENGTH,
                tooShort, tooLong);
    }

    /**
     * Reads {@code in} to end-of-file, one character at a time, and checks
     * every title and description entry found.
     *
     * <p>Capture rules: once a {@code <dt>} (or {@code <dd>}) tag has been
     * closed, every following character -- including that closing '>' and
     * any nested tags -- is appended to the title (or description) buffer
     * until the matching {@code </dt>} (or {@code </dd>}) is closed. The
     * buffer therefore begins with '>' and ends with "</dt" (or "</dd").
     */
    private static void scan(Reader in, Writer titleTooShort, Writer titleTooLong,
            Writer descrTooShort, Writer descrTooLong) throws IOException {
        StringBuilder tag = new StringBuilder();         // name of the tag being read
        StringBuilder titleEntry = new StringBuilder();  // captured <dt> content
        StringBuilder descrEntry = new StringBuilder();  // captured <dd> content
        boolean inTag = false;   // true while between '<' and '>'
        boolean inDT = false;    // true while capturing a title entry
        boolean inDD = false;    // true while capturing a description entry
        String path = "";        // path of the most recent valid title entry

        int c;
        while ((c = in.read()) != -1) {   // read() returns -1 at end-of-file
            if (c == '<') {
                inTag = true;
                tag.setLength(0);
            } else if (inTag) {
                if (c == '>') {
                    inTag = false;
                    String name = tag.toString();
                    if (name.equals("dt")) {
                        inDT = true;
                    } else if (name.equals("dd")) {
                        inDD = true;
                    } else if (name.equals("/dt")) {
                        path = checkTitle(titleEntry.toString(), path,
                                titleTooShort, titleTooLong);
                        titleEntry.setLength(0);
                        inDT = false;
                    } else if (name.equals("/dd")) {
                        checkDescription(descrEntry.toString(), path,
                                descrTooShort, descrTooLong);
                        descrEntry.setLength(0);
                        inDD = false;
                    }
                } else {
                    tag.append((char) c);
                }
            }

            // Add the current character to whichever entry is being captured.
            if (inDT) {
                titleEntry.append((char) c);
            } else if (inDD) {
                descrEntry.append((char) c);
            }
        }
    }

    /**
     * Program entry point.
     *
     * <p>With no arguments the index file
     * {@code ../../articles-index/articles_index.html} is read and the four
     * report files are created in the current working directory. All four
     * files are always created, even when they end up empty.
     *
     * @param args optional; {@code args[0]} overrides the path of the index
     *             file and {@code args[1]} names an existing directory in
     *             which the report files are created
     * @throws Exception if the index file cannot be opened or read, or a
     *                   report file cannot be created or written
     */
    public static void main(String args[]) throws Exception {
        String indexPath = (args != null && args.length > 0)
                ? args[0]
                : DEFAULT_INDEX_DIR + DEFAULT_INDEX_FILE;
        // A null parent makes File(parent, child) behave like File(child),
        // i.e. the report lands in the current working directory.
        File outputDir = (args != null && args.length > 1) ? new File(args[1]) : null;

        // try-with-resources closes every stream that was opened, even when
        // opening a later one or the scan itself fails.
        // NOTE: FileReader/FileWriter use the platform default charset.
        try (Reader index = new BufferedReader(new FileReader(indexPath));
                Writer titleTooLong = new FileWriter(new File(outputDir, TITLE_TOO_LONG_FILE));
                Writer titleTooShort = new FileWriter(new File(outputDir, TITLE_TOO_SHORT_FILE));
                Writer descrTooLong = new FileWriter(new File(outputDir, DESCR_TOO_LONG_FILE));
                Writer descrTooShort = new FileWriter(new File(outputDir, DESCR_TOO_SHORT_FILE))) {
            scan(index, titleTooShort, titleTooLong, descrTooShort, descrTooLong);
        }
    }
}