/** * Search Engine Program * @author Robert John Morton * @version 02-11 April 2020 * @copyright Robert John Morton Transcoded from original Java version dated 22 October 1999 This program requires a specialised index contained in a data file called index.dat located in the same directory as the program. It can search on the up to 16 entered keywords. The program may be downloaded either as a 64-bit executable for Unix/Linux systems or this source file may be insp- ected and compiled by the user. To compile: gcc search.c -o search To run: ./search keyword1 keyword2 keyword3 ... */ #include // input output to/from the console and keyboard #include // standard functions for the 'C' language #include // for initialising strings as character arrays char INDEX[65535], // 48kB array to hold 'index.dat' *H = "https://robmorton.website/", // url of site for internal hrefs *J = "http:localhost/", // url of local copy of website *V = "wget -q -O file.html https://robmorton.website/", // wget html file U[128][128], // array to hold the URLs of the files that match the keywords T[128], // title text of the current HTML file G[512], // array for capturing an HTML tag FP[128], // array to hold the local HTML file's full URL path FQ[128], // array to hold the remote HTML file's full URL path FV[128], // array to hold the wget command for the HTML files *fp, // points to where local file path begins in FP[] *fq, // points to where remote file path begins in FQ[] *fv, // points to where the wget command is stored KW[16][32], // array to hold the validated separate keywords KD[256], // array to hold the display string of validated keywords K[128]; // array to hold a single keyword int A1, B1, C1, D1, E1, N1, i, j, k, LF = 0, FSP[16], // ptr to start of index entries for each entered keyword FSE[16], // extent of index entries for each entered keyword RANK[128], // off-set pointers for HTML files containing keyword SIGF[128], // significance ranking for each HTML document KWF[16], // keyword found flags FTT = 1, // set to indicate first time through inTag = 0, // 1 = reading chararacters that are part of a tag name inTit = 0, // 1 = reading chararacters that are part of the file's title Gc = 0, // tag characters counter Tc = 0, // title characters counter TitFlg = 0, // 1 = got the complete title OK DscFlg = 0, // 1 = got the complete description OK I, // total number of HTML files in the results list R, // pointer to start of the URLs in the results list L; // the number of entered keywords FILE // file pointers [handles] *gfp, // general file pointer used for index.dat and html files *results; // file pointer for the results file /* CONVERT A 4-BYTE TRAIN TO AN INTEGER: 1. Promote each byte to an int 2. Chop off propagated 1s in case high bit set 3. Shift it into its correct position in the int 4. 'or' them all together to form the complete integer */ int getInt(int i) { int x0 = (int)INDEX[i ]; x0 &= 0xff; x0 <<= 24; int x1 = (int)INDEX[i + 1]; x1 &= 0xff; x1 <<= 16; int x2 = (int)INDEX[i + 2]; x2 &= 0xff; x2 <<= 8; int x3 = (int)INDEX[i + 3]; x3 &= 0xff; return x0 | x1 | x2 | x3; } /* GET THE Nth KEYWORD IN THE KEYWORD INDEX: Get the start pointer of the keyword's text, get the start pointer of the next keyword's text, return the length of the keyword. */ void getKeyword(int n) { int p = getInt(20 + ( n << 2)); int q = getInt(20 + (++n << 2)); int i = 0; while(p < q) // use the array K[] because it is now spare *(K + i++) = (char)INDEX[A1 + p++]; *(K + i) = '\0'; // terminate the string with a NULL character } /* Note that a filespec's pointer value is unique to it alone. Retrieve the HTML file's filespec reference from INDEX[]. */ int getFSref(int ptr, int ext) { int p = ptr + ext + ext, // form byte reference of required filespec x2 = (int)INDEX[p++]; // make the two adjacent bytes form x2 &= 0xff; // a 16-bit integer (short)rn x2 <<= 8; int x3 = (int)INDEX[p]; x3 &= 0xff; int rn = x2 | x3; return E1 + getInt(D1 + (rn << 2)); // return the start pointer } // of the filespec text /* This method embodies C A R Hoare's Quick Sort algorithm. Note that it is a highly re-entrant method: it calls itself indefinitely. */ void qs(int LO, int HI) { int lo = LO; // set moving lo to LO end of partition int hi = HI; // set moving hi to HI end of partition if (HI > LO) { // if the partition contains anything int mid = SIGF[(LO + HI) >> 1]; // get content of partition's mid element while(lo <= hi) { // loop through array until indices cross /* while current lowest significance < midway significance, push lower sort boundary up by one element and while current highest keyword > midway keyword, pull upper sort boundary down 1 element. */ while(lo < HI && SIGF[lo] < mid) lo++; while(hi > LO && SIGF[hi] > mid) hi--; if(lo <= hi) { // IF LOW INDEX <= HIGH INDEX SWAP THEIR 'CONTENTS' /* Sort by significance: get index (offset-extent-filenum) of lo element, put index of hi element in lo element, put index of lo element in hi element */ int x = SIGF[lo]; SIGF[lo] = SIGF[hi]; SIGF[hi] = x; /* Shift along with it the rank order: get index (offset- extent-filenum) of lo element,put index of hi element in lo element, put index of lo element in hi element */ x = RANK[lo]; RANK[lo] = RANK[hi]; RANK[hi] = x; lo++; // push lower sort boundary up by one element hi--; // pull upper sort boundary down by one element } } /* if hi not yet reached start of file, sort lower partition; if lo not yet reached end of file, sort upper partition. */ if(LO < hi) qs(LO, hi); if(lo < HI) qs(lo, HI); } } /* END TAG '>' CAPTURE HANDLER. When a '>' character is captured, it signifies the end of an HTML tag. Consequently, we have to determine what type of tag was captured and act accordingly. */ int endTag() { G[Gc] = '\0'; // put terminating NULL char at end of tag inTag = 0; // indicate that we are no longer inside a tag if(strcmp(G,"title") == 0) // if captured tag is a inTit = 1; // indicate we are now inside a title else if(strcmp(G,"/title") == 0) { // if captured tag is a T[Tc] = '\0'; // terminate the title string fprintf(results,"%s\n",FQ,T); inTit = 0; // we are now outside a title TitFlg = 1; // title acquired } /* Else, if we've just captured the '>' of the tag, we've finished with this HTML file, so 'return 1' in order to break out of the parent while loop in main(). */ else if(strcmp(G,"/head") == 0) return 1; else // Sample description tag: if(DscFlg == 0) { // if description captured but is not yet complete char *p, *q; if((strstr(G,"meta") == G) && (strstr(G,"description") != NULL) && ((p = strstr(G,"content")) != NULL) && ((q = strchr(p + 7,'\"') + 1) != NULL) && ((p = strchr(q,'\"')) > q)) { *p = '\0'; // terminate the description string with null char fprintf(results,"
%s

\n",q); DscFlg = 1; } } Gc = 0; // clear for the next tag to be encountered inTag = 0; // we are no longer inside a tag return 0; } /* This function simply writes all the HTML code that generates the heading, extended heading and relevant header tags for the HTML file 'results.html'. */ void header() { fprintf(results,"\n\n" "\n" "\n" "SEARCH RESULTS\n" "\n" "\n" "\n" "\n\n" "\n\n" "\n\n" "

\n\n" "
\n" "

\n" "\n" "\n" "
Robert John Morton

\n\n" "

Web-Book\n" "
The Lost Inheritance\n" "
My Poems\n\n" "

Projects\n" "
Landshare Project\n" "
Sustainable Food\n" "
Sustainable Energy\n\n" "

Interests\n" "
Global Navigation\n" "
Short-Wave Radio\n" "
Chaos Theory\n" "
The Universe\n" "
The Internet\n" "
My Software\n" "
My Ideal PC\n\n" "
Asperger's\n\n" "

Facilities\n" "
Home Page\n" "
Site Index" " | Search\n" "
About This Site\n" "
About Me, \n" "Email\n" "
Security †\n\n" "

\n\n" "

Search Results

\n\n" "

\n\nThese results were generated by a " "downloadable seach engine " "[written in 'C'] dedicated to searching this site. " "Until Java became 'discredited' an excellent " "seach engine applet " "was embedded on the site's search page.\n\n" "

\n\nKeywords: %s\n

\n\n" "
\n", H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, H, KD); if(I == 0) printf("No occurrences found."); } /* Form the list number of the first/next HTML file summary as a 3-didgit string with leading zero[s] and then write it to the results file as the item number of the 'i'th file summary. */ void listNum() { static char C[4] = "000"; // 3-digit display string template for(j = 0; j < 3; j++) // make all 3 digits C[j] = '0'; // into '0' characters int x = i + 1, y = 0; // number of leading zeros j = 3; // possibe maximum of 3 characters to deal with if(x < 100) // if number less than 100, y = 1; // we've 1 less char to deal with if(x < 10) // if number less than 10, y = 2; // we've 2 less chars to deal with while(j > y) { // while there's yet a character to deal with /* Write to appropriate position in the 3-char array the remainder augmented into an ASCII character. */ C[--j] = (char)(x % 10 + 48); x /= 10; // divide the number by 10 [truncated] } fprintf(results,"
%s ",C); // print the result number } // RANK THE RESULTS INTO ORDER OF KEYWORD OCCURRENCE void rank() { I = FSE[0]; // number of URLs pertaining to first keyword R = FSP[0]; // pointer to the start of these URLs for(i = 0; i < I; i++) { // first, make order of rank = order RANK[i] = i; // of occurrence in main index SIGF[i] = 0; // and reset its significance to zero } if(L > 1) { // if more than one keyword was entered /* Sort the pointers to the HTML documents containing the first keyword into an order of ranking according to how many of the other entered keywords each document contains. i = entered keywords index, j = index to URLs of ith enterd keyword's URLs, k = index to */ for(j = 1; j < L; j++) { // for the 2nd entered keyword onwards if(KWF[j] = 1) { // if the keyword was found int M = FSE[j], // number of index entries for the keyword O = FSP[j]; // ptr to start of the keyword's URLs // for each of the HTML documents containing the first keyword for(i = 0; i < I; i++) { int r = getFSref(R, i); // get its filespec reference number for(k = 0; k < M; k++) // for each doc containing ith keyword if(r == getFSref(O,k)) // if same doc as contains 1st keyword ++SIGF[i]; // increment significance of this doc } } } // end of for(j = 1; j < L; j++) qs(0,I - 1); // sort RANK[] into order of significance of HTML docs } // end of if(L > 1) } /* Extract the title and description of one of the HTML files in the results list. */ void getTags() { // The following flags are true when getting DscFlg = 0; // 1 = got the complete description OK TitFlg = 0; // 1 = got the complete title OK Tc = 0; // title characters counter Gc = 0; // tag characters counter inTag = 0; // 1 = reading chararacters that are part of a tag name inTit = 0; // 1 = reading chararacters that are part of the file's title while(1) { // infinite loop broken by end-of-file encounter int c = fgetc(gfp); // read [next] character from the file if(feof(gfp)) break; // end of file encountered so bail out of loop if(c == '<') { // '<' marks start of tag inTag = 1; // indicate we are now receiving tag content Gc = 0; // zero the tag's character counter } else if(c == '>') { // '>' marks end of tag if(endTag() == 1) // if an tag has been captured break; // we've finished with this file } else if(inTag) { // if currently inside a tag if(Gc > 500) break; // bail out if tag buffer full G[Gc++] = c; // add char to G string } else if(inTit && c != '\n') { // if inside title and chr not a '\n' if(Tc > 120) break; // exit if title buffer full T[Tc++] = c; // add current char to text } } // end of while(1) loop fclose(gfp); // close the HTML file // output the hyperlinked title to the results file. if(TitFlg == 0) fprintf(results, "<a href=\"%s\">%s</a></dt>\n",FQ,FQ); // output file description between <dd></dd> tags. if(DscFlg == 0) fprintf(results,"<dd><div><font color=red>No Description</font>" "</div></dd><p>\n"); } void main(int argc, char *argv[]) { fp = FP + strlen(J); // points to where the local path begins fq = FQ + strlen(H); // points to where the remote file path begins fv = FV + strlen(V); // points to where the wget command begins /* INPUT THE KEYWORDS. The command line arguements are the keywords. argc = number of keywords entered + 1. 'argv[n]' contains the nth keyword. If only one keyword is entered, then argc = 2 and arg[1] contains that keyword. */ L = argc - 1; int l, m = 0; char c, *p, *q; if(L > 15) L = 15; // accept only up to 16 keywords for(i = 0; i < L; i++) { // for each of the entered keywords p = argv[i + 1]; // pointer to next input keyword l = strlen(p); // find its length [number of characters] if(l > 30) l = 30; // maximum keyword length q = KW[i]; // pointer to where to put this keyword for(j = 0; j < l; j++) { // for each character in the keyword c = *(p + j); // get the character if(c > '@' && c < '[') // if c is an upper case letter c += 32; // make it lower case if ((c > '/' && c < ':') // if c is a number || (c > 96 && c < '{')) { // or a lower case letter *(q + j) = c; // add it to current keyword *(KD + m + j) = c; // add it to keywords string } else if(c == '-') { // if '-' or '-l' switch entered LF = 1; // set for loading from local files L--; // Reduce 'number of keywords entered' break; // to exclude the switch from being } // regarded as a keyword. } if(LF == 1) break; // bail out of using local file copies *(q + l) = '\0'; // put termination NULL at end of keyword *(KD + m + l) = ' '; // add space after this keyword in keywords string m += l + 1; // move base pointer up to start of next keyword } *(KD + m + 1) = '\0'; // terminate keywords string with a NULL character // Load the index data from the file 'index.dat' if(LF == 0) // if 'local' switch unset, download 'index.dat' from server system( "wget -q -O index.dat " "https://robmorton.website/" "software/C-programs/search/index.dat" ); gfp = fopen("index.dat","rb"); // open the index file for reading i = 0; // initialise byte counter while(feof(gfp) == 0) { // while end of file not yet reached if(i > 65535) { // if no space left in INDEX[] array printf("Index too big: enlarge INDEX[] and recompile.\n"); return; // bail out safely } INDEX[i++] = fgetc(gfp); // add current byte to the INDEX[] array } INDEX[i] = '\0'; // terminate the index with a NULL just in case fclose(gfp); // close 'index.dat' // Set the byte number pointers for accessing INDEX[] A1 = getInt( 0); // start byte of the keyword text stream N1 = (A1 >> 2) - 1; // total number of keywords in the index B1 = getInt( 4) + 20; // start byte of filespec reference pointers C1 = getInt( 8) + 20; // start byte of filespec reference numbers D1 = getInt(12) + 20; // start byte of filespec text pointers E1 = getInt(16) + 20; // start byte of filespec text stream A1 += 20; // bias over first 20 bytes holding A1,B1,C1,D1,E1 // Get the entered keywords from the console. KWIN: if(L == 0) { // if no keywords were entered on the command line k = 0; // set index to first char for first keyword printf("\n\nEnter search keywords as one " "line of space-delimited words:\n"); fgets(KD, 256, stdin); // get the input string from the colsole l = strlen(KD); // length of console input string for(i = 0; i < l; i++) { // for each char in console input string if(L > 16) break; // bail out if 16 keywords already processed c = KD[i]; // get next character from input string if(c == '\n') { // if end-of-input new-line encountered KW[L++][k] = '\0'; // put it as a NULL char at end of keyword break; // bail out because we have all the keywords } else if(c == ' ') { // if a space character encountered KW[L][k] = '\0'; // terminate keyword with a NULL character L++; // move up to next keyword k = 0; // and reset letter index to first letter } else KW[L][k++] = c; // else add new char to current keyword } } printf("Searching "); // 'Searching <URL>' if(LF == 0) printf("%s\n", H); else printf("%s\n", J); /* Do a binary slice search if the INDEX[] array to try to find each entered keyword*/ for(i = 0; i < L; i++) { // for each entered keyword int // TRY TO FIND ENTERED KEYWORD IN INDEX[] Flag = 0, // 1:keyword found x, // 3-state string comparison variable n = N1 >> 1, // start with keyword half way up INDEX[] j = n; // jump size starts at half INDEX[] size // Try to find keyword by binary slicing: while((j >>= 1) > 0) { // While halved jump size still > 0 ... getKeyword(n); // make 'K' point to nth keyword in INDEX[] x = strcmp(KW[i],K); // compare entered word with word in INDEX[] if(x > 0) // if 's' is higher than 'K' n += j; // the one we're looking for must be higher else if(x < 0) // if 's' is lower than 'K' n -= j; // the one we're looking for must be lower else { // else keyword found Flag = 1; // so set the 'found' flag break; // and beak out of the while() loop } } // end of while() /* Continue with an inch-up/inch-down search of INDEX[] if the binary slice search failed to find the keyword. */ if(Flag == 0) { int up = 0, dn = 0; // 1:going up / going down respectively while(up == 0 || dn == 0) { // while not yet reversed direction getKeyword(n); // make 'K' point to nth keyword in INDEX[] x = strcmp(KW[i], K); // compare entered word 's' with word in INDEX[] if(x > 0) { // if KW[i] is higher than 'K' if(++n >= N1) // if moving up would overshoot top of INDEX break; // exit while() loop: keyword not found up = 1; // if not, set that we have moved up a peg } else if(x < 0) { // if 's' is higher than 'K' if(--n < 0) // if moving down would undershoot bottom of INDEX break; // exit while() loop: keyword not found dn = 1; // if not, set that we have moved up a peg } else { // else keyword found Flag = 1; // so set the 'found' flag break; // and beak out of the while() loop } } // end of while() } // end of if(Flag == 0) { /* Store INDEX[] start byte of each entered keyword's files list and the number of files that contain that keyword. */ if(Flag == 1) { KWF[i] = 1; // keyword found int y = C1 + getInt(B1 + (n << 2)); FSE[i] = (C1 + getInt(B1 + (++n << 2)) - y) >> 1; FSP[i] = y; } else { KWF[i] = 0; // keyword not found FSE[i] = 0; // zero URLs pertain to this keyword } } // end of for(i =) loop rank(); // rank the results URLs in order of keyword occurrence /* Place the THE URLs of the HTML files that comprise the search results into the output array U[][] */ for(i = 0; i < I; i++) { // for each of these URLs: int p = R + i + i; // form byte reference of required filespec int x2 = (int)INDEX[p++]; x2 &= 0xff; // make the two adjacent bytes form x2 <<= 8; // a 16-bit integer (short)rn int x3 = (int)INDEX[p]; x3 &= 0xff; int rn = x2 | x3, P = E1 + getInt(D1 + ( rn << 2)), // start pointer of filespec text Q = E1 + getInt(D1 + (++rn << 2)); // end pointer of filespec text j = 0; while(P < Q) // transfer the filespec U[RANK[i]][j++] = (char)INDEX[P++]; // into the U[] char array U[RANK[i]][j] = '\0'; // terminate it with a NULL char } /* Try to open the results file 'results.html' for writing. If the file opening attempts fails, write an appropriate error message to the terminal.*/ results = fopen("results.html","w"); // open results file for writing if(results == NULL) { // couldn't open it printf("Can't open results file.\n"); // so show error message return; // and exit } header(); // write the results file's HTML header information /* For each HTML file in the search results list, display its list number in the terminal, write its list number as a 3-digit string with leading zeros, into the results file, extract its title and description and write them to the results file. */ for(i = 0; i < I; i++) { printf("\rFound %i relevant files. ",i + 1); fflush(stdout); // display in terminal number of results found so far listNum(); // write item number of file summary to results file k = I - i - 1 ; // reverse the order: most significant file first strcpy(FQ,H); // copy remote base URL into the full file path strcpy(fq,U[k]); // add the local file path into full URL path if(LF == 0) { // if using remote website to get HTML files strcpy(FV,V); // copy base wget command strcpy(fv,U[k]); // add the local file path to it system(FV); // download HTML file from remote server gfp = fopen("file.html","r"); } else { // if using local website to get HTML files strcpy(FP,J); // copy local base URL into full file path strcpy(fp,U[k]); // add local file path into full URL path gfp = fopen(FP,"r"); // open current HTML file for reading } if(gfp == NULL) { // if can't open the HTML file printf("%s\n","Can't open HTML file."); return; // display error message and exit } getTags(); // extract the title and description from the HTML file } /* Write final <dl> & end tags in the results file, close 'results.html' and display the results file in the Firefox web browser. */ fprintf(results,"</dl>\n</div>\n</body>\n</html>\n"); fclose(results); system("xdg-open results.html &"); L = 0; // set number of keywords entered to zero goto KWIN; // and go back to input new keywords }