*** empty log message ***

hb2143 [2005-06-17 21:08:37]
*** empty log message ***
Filename
src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java
src/psl/crunch3/util/TreeDistane.java
diff --git a/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java b/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
index dc78d62..2a3ffac 100644
--- a/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
+++ b/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
@@ -167,8 +167,10 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 			URL site =null;
 			Document newTree;
 			InputStream in;
+
+
 			try{
-				while((linkToAppend !=null) && (linkToAppend != address)){
+				while((linkToAppend !=null) && (linkToAppend != address) ){


 					System.out.println("*** " + linkToAppend);
@@ -207,7 +209,9 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 					newTree = parser.getDocument();
 					linkToAppend = null;

+					counter++;
 					extract(newTree,newTree);
+					System.out.println(linkToAppend + ",,,,,,,,,,");
 					if(linkToAppend != null) System.out.println("***## " + linkToAppend);

 					//prettyPrint(newTree, System.out);
@@ -1796,6 +1800,8 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende

 			int cluster = 0;

+			System.out.println("******************************* "+ (WordCount.parseURL(URL.substring(7),true)));
+
 			//check if the site is already clustered.
 			if((cluster = descriptionGUI.getCluster(WordCount.parseURL(URL.substring(7),true))) != 0){
 				System.out.println(URL +" is already clustered");
diff --git a/src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java b/src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java
index 496cd26..1310e84 100644
--- a/src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java
+++ b/src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java
@@ -79,7 +79,7 @@ public class ContentExtractorDescriptionGUI {
 	private Label relaxLabel = null;
 	private Label toughenLabel = null;
 	private Button frontPageCheck = null;
-
+	private Button nextPageCheck = null;

 	private ContentExtractorSettings newFilter = ContentExtractorSettings.getInstance();
 	private boolean isAuto = false;
@@ -90,6 +90,7 @@ public class ContentExtractorDescriptionGUI {
 	private int engineNumber = 5;
 	private int settingLevel = 0;
 	private boolean frontPage = true;
+	private boolean nextPage = true;
 	private String settingsLabel;

 	/**
@@ -151,7 +152,7 @@ public class ContentExtractorDescriptionGUI {
 		infoButton = new Button(userSettingsGroup, SWT.RADIO);
 		pdaButton = new Button(userSettingsGroup, SWT.RADIO);
 		frontPageCheck = new Button(userSettingsGroup, SWT.CHECK);
-
+		nextPageCheck = new Button(userSettingsGroup, SWT.CHECK);

 		ContentPluginSeparator2 = new Sash(mainComposite, SWT.HORIZONTAL | SWT.BORDER);
 		automaticGroup = new Group(mainComposite, SWT.NULL);
@@ -301,6 +302,9 @@ public class ContentExtractorDescriptionGUI {
 		impairedButton.setText("Visually Impaired");
 		frontPageCheck.setText("Detect Front Page");
 		frontPageCheck.setSelection(true);
+		nextPageCheck.setText("Append Next Page");
+		nextPageCheck.setSelection(true);
+

 		ContentPluginSeparator2.setLayoutData(ContentSeparator2);
 		specificImageLabel.setLayoutData(specificImageLabelGridData);
@@ -340,37 +344,52 @@ public class ContentExtractorDescriptionGUI {

 		// START EVENT_INITIALIZATION
 		newsButton.addSelectionListener(new SelectionAdapter() {
+
 			public void widgetSelected(SelectionEvent e){
+				frontPageCheck.setSelection(true);
+		 		frontPage = true;
 				newsButton_widgetSelected(e);
 			}
 		});
 		shoppingButton.addSelectionListener(new SelectionAdapter() {
 			public void widgetSelected(SelectionEvent e){
+				frontPageCheck.setSelection(true);
+		 		frontPage = true;
 				shoppingButton_widgetSelected(e);
 			}
 		});
 		governmentButton.addSelectionListener(new SelectionAdapter() {
 			public void widgetSelected(SelectionEvent e){
+				frontPageCheck.setSelection(true);
+		 		frontPage = true;
 				governmentButton_widgetSelected(e);
 			}
 		});
 		educationButton.addSelectionListener(new SelectionAdapter() {
 			public void widgetSelected(SelectionEvent e){
+				frontPageCheck.setSelection(true);
+		 		frontPage = true;
 				educationButton_widgetSelected(e);
 			}
 		});
 		textHeavyButton.addSelectionListener(new SelectionAdapter() {
 			public void widgetSelected(SelectionEvent e){
+				frontPageCheck.setSelection(true);
+		 		frontPage = true;
 				textHeavyButton_widgetSelected(e);
 			}
 		});
 		linkHeavyButton.addSelectionListener(new SelectionAdapter() {
 			public void widgetSelected(SelectionEvent e){
+				frontPageCheck.setSelection(true);
+		 		frontPage = true;
 				linkHeavyButton_widgetSelected(e);
 			}
 		});
 		autoButton.addSelectionListener(new SelectionAdapter() {
 			public void widgetSelected(SelectionEvent e){
+				frontPageCheck.setSelection(true);
+		 		frontPage = true;
 				auto_widgetSelected(e);
 			}
 		});
@@ -437,6 +456,18 @@ public class ContentExtractorDescriptionGUI {
 			}
 		});

+		nextPageCheck.addSelectionListener(new SelectionAdapter(){
+
+			/**
+			 * Stores the current state of nextPageCheck in nextPage.
+			 * @param e the selection event; not inspected
+			 */
+			public void widgetSelected(SelectionEvent e){
+				//getSelection() already yields the boolean to store
+				nextPage = nextPageCheck.getSelection();
+			}
+		});
+
 		// TODO
 		if (ContentExtractor.customLast){
 			customButton.setSelection(true);
@@ -772,6 +803,10 @@ public class ContentExtractorDescriptionGUI {
     	return frontPage;
     }

+    public boolean checkNextPage(){ // current value of the nextPage flag (initially true)
+    	return nextPage;
+    }
+
     public String getSettingsLabel(){
     	return settingsLabel;
     }
diff --git a/src/psl/crunch3/util/TreeDistane.java b/src/psl/crunch3/util/TreeDistane.java
new file mode 100644
index 0000000..0541209
--- /dev/null
+++ b/src/psl/crunch3/util/TreeDistane.java
@@ -0,0 +1,104 @@
+/*
+ * Created on Jun 8, 2005
+ *
+ * TODO To change the template for this generated file go to
+ * Window - Preferences - Java - Code Style - Code Templates
+ */
+package psl.crunch3.util;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URL;
+import org.w3c.dom.*;
+import org.xml.sax.InputSource;
+
+/**
+ * Command-line utility that parses two web pages into DOM trees and prints
+ * a simple distance obtained by comparing the trees position by position.
+ *
+ * @author hb2143
+ */
+public class TreeDistane {
+
+	/**
+	 * Fetches and parses the two pages, then prints their tree distance.
+	 *
+	 * @param args args[0] and args[1] are the URLs of the pages to compare
+	 */
+	public static void main(String[] args) {
+
+		if (args == null || args.length < 2){
+			System.out.println("usage: TreeDistane <url1> <url2>");
+			return;
+		}
+
+		//generate DOM trees for both sites
+		Document tree1 = getDOMTree(args[0]);
+		Document tree2 = getDOMTree(args[1]);
+		if (tree1 == null || tree2 == null){
+			//getDOMTree has already printed the reason for the failure
+			return;
+		}
+
+		//measure distance by comparing the trees, node by node
+		System.out.println(getDistanceEasy(tree1, tree2));
+	}
+
+	/**
+	 * Opens site as a URL and parses its content with the cyberneko DOMParser.
+	 *
+	 * @param site URL of the page to fetch
+	 * @return the parsed document, or null if fetching or parsing failed
+	 */
+	private static Document getDOMTree(String site){
+
+		org.cyberneko.html.parsers.DOMParser parser = new org.cyberneko.html.parsers.DOMParser();
+		InputStream in = null;
+		try{
+			in = new URL(site).openStream();
+			InputStreamReader reader = new InputStreamReader(in,"ISO-8859-1");
+			parser.parse(new InputSource(reader));
+			return parser.getDocument();
+		}
+		catch(Exception e){
+			System.out.println(e.getClass());
+			e.printStackTrace();
+			return null;
+		}
+		finally{
+			//always release the stream, even when parsing fails
+			if (in != null){
+				try{ in.close(); } catch(java.io.IOException ignored){ /* nothing left to release */ }
+			}
+		}
+	}
+
+	/**
+	 * Compares nodes a and b and recursively compares their children pairwise.
+	 * A pair whose node type or node name differs adds 1, and every child that
+	 * has no counterpart at the same index in the other node adds 1.
+	 * Node.equals is avoided: the org.w3c.dom.Node interface does not specify
+	 * it, so it typically falls back to identity and never matches across trees.
+	 *
+	 * @return the total distance
+	 */
+	private static int getDistanceEasy(Node a, Node b){
+		int counter = 0;
+		if (a.getNodeType() != b.getNodeType() || !a.getNodeName().equals(b.getNodeName())){
+			counter++;
+		}
+
+		NodeList aChildren = a.getChildNodes();
+		NodeList bChildren = b.getChildNodes();
+
+		//children beyond the shorter list have no partner: count each one once
+		int numChildren = Math.min(aChildren.getLength(), bChildren.getLength());
+		counter += Math.max(aChildren.getLength(), bChildren.getLength()) - numChildren;
+
+		for(int i=0;i<numChildren;i++){
+			counter += getDistanceEasy(aChildren.item(i), bChildren.item(i));
+		}
+
+		return counter;
+	}
+}