next next (not done)

hb2143 [2005-06-02 20:17:58]
next next (not done)
Filename
.classpath
config/content extractor settings.ini
src/psl/crunch3/Crunch3Settings.java
src/psl/crunch3/MainWindow.java
src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
diff --git a/.classpath b/.classpath
index 035237a..9ebf048 100644
--- a/.classpath
+++ b/.classpath
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <classpath>
-	<classpathentry kind="src" path="src"/>
+	<classpathentry excluding="psl/crunch3/util/wrapperinduction/" kind="src" path="src"/>
 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
 	<classpathentry kind="lib" path="jars/swt.jar"/>
 	<classpathentry kind="lib" path="jars/nekohtml-0.8.2/nekohtml.jar"/>
diff --git a/config/content extractor settings.ini b/config/content extractor settings.ini
index b59ea67..a6af337 100644
--- a/config/content extractor settings.ini
+++ b/config/content extractor settings.ini
@@ -1,42 +1,42 @@
 #Content Extractor Settings File
-#Wed Apr 06 16:16:47 EDT 2005
+#Thu Jun 02 15:24:02 EDT 2005
 <A>\ tags\ are\ substance=true
-<FORM>\ tags\ are\ substance=false
+<FORM>\ tags\ are\ substance=true
 Ignore\ Image\ Links=true
 Ignore\ Meta\ Tags=true
-Display\ Image\ Link\ ALTs=false
+Display\ Image\ Link\ ALTs=true
 Ignore\ Text\ Links=true
 <INPUT>\ tags\ are\ substance=true
-Ignore\ <IFRAME>\ Tags=false
+Ignore\ <IFRAME>\ Tags=true
 <BUTTON>\ tags\ are\ substance=true
 Ignore\ All\ Advertisements=true
 <SELECT>\ tags\ are\ substance=true
-Ignore\ External\ Stylesheets=false
+Ignore\ External\ Stylesheets=true
 <IFRAME>\ tags\ are\ substance=true
-Ignore\ <EMBED>\ tags=false
-Ignore\ Styles=false
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=true
 Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
 Ignore\ <INPUT>\ Tags=true
-Ignore\ Flash=false
+Ignore\ Flash=true
 Ignore\ Image\ Links\ in\ Link\ Lists=true
 <IMG>\ tags\ are\ substance=true
 <TEXTAREA>\ tags\ are\ substance=true
 Ignore\ Scripts=true
 Ignore\ Forms=true
-Display\ Image\ ALTs=false
+Display\ Image\ ALTs=true
 Maximum\ Number\ of\ Line\ Breaks=2
 Ignore\ <BUTTON>\ Tags=true
-Limit\ Number\ of\ Line\ Breaks=true
-Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=true
 Ignore\ <NOSCRIPT>\ Tags=true
 Add\ removed\ links\ to\ bottom\ of\ the\ page=false
-Minimum\ text\ length\ as\ substance=1
+Minimum\ text\ length\ as\ substance=12
 Remove\ Empty\ Tables=true
 Ignore\ Text\ Links\ in\ Link\ Lists=true
-Ignore\ Table\ Cell\ Widths=false
-Ignore\ Style\ Attributes=false
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=true
 Ignore\ <SELECT>\ Tags=true
-Ignore\ Link\ Lists=true
 Link/Text\ Removal\ Ratio=0.25
 Ignore\ Images=true
+Ignore\ Link\ Lists=true
 Print\ Only\ Text=false
diff --git a/src/psl/crunch3/Crunch3Settings.java b/src/psl/crunch3/Crunch3Settings.java
index 89a93cf..92f5f83 100644
--- a/src/psl/crunch3/Crunch3Settings.java
+++ b/src/psl/crunch3/Crunch3Settings.java
@@ -27,7 +27,7 @@ public class Crunch3Settings {
 	public static final String[] FILTER_TYPES_DEF = { "text/html" };
 	public static final boolean FILTER_HOMEPAGES_DEF = true;
 	public static final boolean PROXY_MODE_DEF = true;
-	public static String SETTINGS_FILE = "config/level2.ini";
+	public static String SETTINGS_FILE = "config/level4.ini";
 	public static final boolean CHECK_HOMEPAGE_DEF = false;

 	//internal variables
diff --git a/src/psl/crunch3/MainWindow.java b/src/psl/crunch3/MainWindow.java
index b58c724..108c835 100644
--- a/src/psl/crunch3/MainWindow.java
+++ b/src/psl/crunch3/MainWindow.java
@@ -1166,12 +1166,11 @@ public class MainWindow extends Thread {
 		Crunch3.Display_1.syncExec(new Runnable(){
 			public void run(){
 				if(gui != null){
-					gui.commitSettings("config" + File.separator + "level2.ini", 2);
-					gui.setSettingsLevel(2);
+					gui.commitSettings("config" + File.separator + "level4.ini", 4);
+					gui.setSettingsLevel(4);
 				}
 			}
 		});
-
 	}

 	private String[] split(final String text, final String delimiters) {
diff --git a/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java b/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
index 6b44596..dc78d62 100644
--- a/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
+++ b/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
@@ -95,6 +95,7 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 	private boolean detectRandomSurfing = false;
 	private String linkToAppend = null;
 	private String currentAddress = null;
+	private int counter=1; //counts the number of pages to append

 	/**
 	 * Creates a new instance without any input stream and the default settings file.
@@ -148,10 +149,7 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 			mTree = parser.getDocument();
 			extractContent(mTree);

-
-
 		}
-
 		catch (Exception e) {
 			e.printStackTrace();
 		}
@@ -161,16 +159,17 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 		if (child) {
 			if (mTree == null)
 				mTree = (Document) iNode;
+			counter = counter + 1;
 			extract(iNode, mTree);

-
 			org.cyberneko.html.parsers.DOMParser parser = new org.cyberneko.html.parsers.DOMParser();
 			String address = null;
 			URL site =null;
 			Document newTree;
 			InputStream in;
 			try{
-				/**if((linkToAppend !=null) && (linkToAppend != address)){
+				while((linkToAppend !=null) && (linkToAppend != address)){
+

 					System.out.println("*** " + linkToAppend);
 					address = linkToAppend;
@@ -184,11 +183,18 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende

 							if(currentAddress == null)
 								currentAddress = Crunch3.mainWindow.getURL();
+								String temp = currentAddress.substring(7);
+
+								int index = temp.indexOf("/");

-								site = new URL (currentAddress + address);
+								String first = currentAddress.substring(0,index+7);
+
+								if(!(first.endsWith("/")) && !(address.startsWith("/"))) first = first + "/";
+								site = new URL (first + address);


 						}
+
 						catch(Exception e){
 							e.printStackTrace();
 							//break;
@@ -200,13 +206,19 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 					parser.parse(new InputSource(reader));
 					newTree = parser.getDocument();
 					linkToAppend = null;
+
 					extract(newTree,newTree);
-					if(linkToAppend != null) System.out.println("*** " + linkToAppend);
-					//prettyPrint(newTree, System.out);
+					if(linkToAppend != null) System.out.println("***## " + linkToAppend);
+
+					//prettyPrint(newTree, System.out);
+					System.out.println((mTree.getFirstChild()).getNodeName() + "*************");
 					appendDocument(newTree, mTree);
-					//prettyPrint(mTree, System.out);

-				}**/
+
+				}
+			}
+			catch(FileNotFoundException fe){
+
 			}
 			catch(Exception e){
 				e.printStackTrace();
@@ -952,9 +964,9 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 	private String getNextLink(final Node iNode){
 		Node temp = iNode.getFirstChild();
 		if((temp != null) && (temp.getNodeType() == Node.TEXT_NODE)){
-
+			String num = Integer.toString(counter);
 			String text = ((temp.getNodeValue()).trim()).toLowerCase();
-			if (text.startsWith("next")){
+			if ((text.startsWith("next")) || (text.trim()).startsWith(num)){
 				String s = ((Element)iNode).getAttribute("href");
 				return s;
 			}
@@ -1307,39 +1319,47 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende

 	private void appendDocument(Document from, Document to){

-		//Node newNode = to.importNode(from.getDocumentElement(), true);
-		Node fromCurrent = (from.getFirstChild()).getFirstChild();
-		Node toCurrent;
-		while(fromCurrent != null){
-			System.out.println(fromCurrent.getNodeName());
-			if ((fromCurrent.getNodeName()).equals("BODY")){
-
-				toCurrent = (to.getDocumentElement()).getFirstChild();
-
-				System.out.println("before loop" + toCurrent.getNodeName());
-				while(toCurrent != null){
-					if ((toCurrent.getNodeName()).equals("BODY")){
-
-						//append all children of fromCurrent to toCurrent
-						fromCurrent = fromCurrent.getFirstChild();
-						while (fromCurrent !=null){
-							toCurrent.appendChild(to.importNode(fromCurrent, true));
-							fromCurrent = fromCurrent.getNextSibling();
-						}
-						toCurrent = null;
-					}
-					else{
-						System.out.println(toCurrent.getNodeName());
-						toCurrent = toCurrent.getNextSibling();
-					}
-				}
-
-				fromCurrent = null;
+		Node fromBody = ((from.getFirstChild()).getFirstChild());
+
+		System.out.println(from.getNodeName());
+
+		while (!((fromBody.getNodeName()).equals("BODY")) && !(fromBody == null)){
+			fromBody = fromBody.getNextSibling();
+		}
+
+
+		from.removeChild(from.getFirstChild());
+
+		Node toBody = ((to.getDocumentElement()).getFirstChild());
+
+
+		while (!((toBody.getNodeName()).equals("BODY")) && !(toBody == null)){
+			toBody = toBody.getNextSibling();
+		}
+
+		append(to,fromBody, toBody);
+
+
+	}
+
+
+
+	private void append(Document doc, Node n, Node to){
+
+		NodeList children = n.getChildNodes();
+		Node newNode;
+		for (int i=0; i<children.getLength();i++){
+			try{
+				newNode = doc.importNode(children.item(i),false);
+				to.appendChild(newNode);
+				append(doc,children.item(i), newNode);
+			}
+			catch(Exception e){
+				System.out.println(children.item(i));
 			}
-			else fromCurrent = fromCurrent.getNextSibling();
+

 		}
-
 	}


@@ -1757,6 +1777,7 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 			else System.out.println("This is not a Homepage");
 		}

+		//handles frontpage detection for GUI-less crunch
 		if(descriptionGUI == null) {

 			if(Crunch3.settings.isHomePageCheck()){
@@ -1769,8 +1790,10 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 			}
 			return;
 		}
+
+		//compute closest cluster to current URL and apply appropriate filter settings
 		if (descriptionGUI.isAuto()){
-			//check what cluster the site belongs to and determine the correct filter.
+
 			int cluster = 0;

 			//check if the site is already clustered.
@@ -1778,29 +1801,28 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 				System.out.println(URL +" is already clustered");
 			}
 			else{
-
+				//create a new wordcount object that is used to generate the word-frequency map for the current site
 				WordCount wc = new WordCount(URL.substring(7), descriptionGUI.getFrequencies(),
 						descriptionGUI.getKeys(), descriptionGUI.getSites(), descriptionGUI.getEngineNumber());
+
+				//get the site closest to URL from preclustered list
 				String closest = wc.getClosestSite();
 				if(closest !=null)
 					cluster = descriptionGUI.getCluster(closest);
 				else cluster = 0;
-
-				System.out.println(cluster);
-
-
 			}

+			//keep track of visited clusters for random surfing detection
 		    visitedClusters.addElement(new Integer(cluster));
 		    if(visitedClusters.size()>3){
 		    	visitedClusters.removeElementAt(0);
 		    }

-		    //this method should be changed after each run of WordCount.
+		    //apply filter settings based on the cluster number
 		    applySettings(cluster);


-			//if the page is a news home page change the setting to level 6
+			//if the page is a news home page relax the settings
 			if(hpt.isHomePage()){
 				int level = descriptionGUI.getSettingLevel();
 				if(level ==2){
@@ -1820,12 +1842,14 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende


 		}
+
+		//if settings aren't automatic, just apply regular front-page detection proc
 		else{
 			if(descriptionGUI.checkFrontPage() && hpt.isHomePage()){

-				if((descriptionGUI.getSettingsLabel()).equals("shopping")){
-					descriptionGUI.commitSettings("config" + File.separator + "level9.ini", 9);
-					descriptionGUI.setSettingsLevel(9);
+				if((descriptionGUI.getSettingsLabel()).equals("news")){
+					descriptionGUI.commitSettings("config" + File.separator + "level6.ini", 6);
+					descriptionGUI.setSettingsLevel(6);
 				}


@@ -1839,14 +1863,24 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 		}
 	}

+
 	private void relax(){
 		descriptionGUI.commitSettings("config" + File.separator + "level" + 9 + ".ini", 9);
 	}

+
+	/**
+	 * Selects the custom button and deselects any other options in the description GUI
+	 */
 	public void selectCustom(){
 		descriptionGUI.selectCustom();
 	}

+
+	/**
+	 * Apply the correct filter settings determined manually, given the cluster number.
+	 * @param cluster
+	 */
 	private void applySettings(int cluster){
 		switch(cluster){

@@ -1899,6 +1933,12 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 		descriptionGUI.updateSettingsLevel();
 	}

+
+	/**
+	 * If the last three sites visited are different from each other,
+	 * switch to random surfing mode
+	 * @return
+	 */
 	private boolean isRandomSurfing(){
 		int current = ((Integer)(visitedClusters.elementAt(2))).intValue();
 		int prev1 = ((Integer)(visitedClusters.elementAt(1))).intValue();