OK, this is what I have so far.

hb2143 [2005-02-15 18:00:55]
OK, this is what I have so far.
This is by no means done, but I figured you might want to check it out in case it is sufficient for a demo.
Filename
clusters.txt
config/level1.ini
config/level10.ini
config/level11.ini
config/level12.ini
config/level2.ini
config/level3.ini
config/level4.ini
config/level5.ini
config/level6.ini
config/level7.ini
config/level8.ini
config/level9.ini
src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java
src/psl/crunch3/util/WordCount.java
diff --git a/clusters.txt b/clusters.txt
index 727fa78..5ef30a2 100644
--- a/clusters.txt
+++ b/clusters.txt
@@ -1,153 +1,149 @@
-the-times+co 1
-timesonline+co 1
-indiaabroad 2
-rediff 2
-hindustantimes 2
-timesofindia+indiatimes 2
-pricewatch 3
-streetprices 3
-pricegrabber 3
-nextag 3
-apple 3
-auctions+yahoo 3
-resellerratings 3
-bhphotovideo 3
-overstock 3
-buy 3
-newegg 3
-tomshardware 3
-BizRate 4
-Shopzilla 4
-epinions 4
-shopping 4
-fatwallet 4
-shopping+yahoo 4
-news+bbc+co 5
-bbc+co 5
-news+yahoo 5
-news+com 5
-expressindia 6
-indianexpress 6
-observerindia 6
-spaceflightnews 7
-spacetoday 7
-universetoday 7
-spaceDaily 7
-rednova 7
-spaceref 7
-space 7
-spaceflightnow 7
-hp 8
-ibm 8
-dell 8
-infoworld 8
-iht 9
-observer 9
-newyorkpost 9
-cnn 9
-ny1 9
-drudgereport 9
-news+ft 9
-washingtontimes 9
-cbc 9
-hinduonnet 9
-timesunion 9
-suntimes 9
-stltoday 9
-sports+yahoo 9
-ap 9
-theunionleader 9
-themoscowtimes 9
-globalsecurity 9
-allsports 9
-sfgate 9
-csmonitor 9
-thisislondon+co 9
-pressherald+mainetoday 9
-wired 9
-usatoday 9
-jpost 9
-smh+com 9
-fark 9
-latimes 9
-tech+nytimes 9
-cleveland 9
-pcworld 9
-salon 9
-thesun+co 9
-sportingnews 9
-nytimes 9
-japantoday 9
-dallasnews 9
-economist 9
-gertzfile 9
-nydailynews 9
-sun-sentinel 9
-cbs+sportsline 9
-ndtv 9
-abqjournal 9
-tnr 10
-tomalak 10
-heroicstories 10
-marsrovers+jpl+nasa 10
-prosportsdaily 10
-kuro5hin 10
-catless+ncl+ac 10
-outlookindia 10
-techreview 10
-digitalidworld 10
-slate+msn 10
-time 10
-joystiq 10
-groklaw 10
-slashdot 10
-blogs+msdn 10
-fuckedcompany 10
-engadget 10
-mobilewhack 10
-counterpunch 10
-connection 11
-warehouse 11
-tesco 11
-eonline 11
-target 11
-ebay 11
-walmart 11
-zones 11
-insight 11
-amazon 11
-computerworld 11
-anandtech 12
-theinquirer 12
-theregister 12
-techjapan 12
-arstechnica 12
-mtekk+com 12
-infosyncworld 12
-SpotStop 12
-worldtribune 12
-pocketpcthoughts 12
-dpreview 12
-eHomeUpgrade 12
-ppcw 12
-hardocp 12
-mobiletracker 12
-eweek 12
-photographyblog 12
-defamer 13
-gawker 13
-wonkette 13
-gizmodo 13
-azcentral 14
-chicagotribune 14
-tampatrib 14
-thestate 14
-baltimoresun 14
-orlandosentinel 14
-tallahassee 14
-charlotte 14
-miami 14
-ajc 14
-indystar 14
-digitalmediathoughts 15
-smartphonethoughts 15
+the-times+co 1 0
+timesonline+co 1 0
+pricewatch 2 0
+streetprices 2 0
+epinions 2 1
+pricegrabber 2 1
+BizRate 2 2
+nextag 2 2
+Shopzilla 2 2
+shopping 2 2
+tesco 2 2
+anandtech 2 2
+fatwallet 2 2
+shopping+yahoo 2 2
+auctions+yahoo 2 1
+resellerratings 2 1
+ebay 2 2
+SpotStop 2 1
+overstock 2 2
+bhphotovideo 2 1
+buy 2 1
+amazon 2 2
+tomshardware 2 1
+newegg 2 2
+indiaabroad 3 0
+rediff 3 0
+outlookindia 3 1
+hindustantimes 3 2
+thesun+co 3 1
+timesofindia+indiatimes 3 1
+ndtv 3 1
+expressindia 4 0
+indianexpress 4 0
+observerindia 4 1
+news+bbc+co 5 0
+bbc+co 5 0
+news+yahoo 5 1
+news+ft 5 1
+cnn 5 1
+ap 5 1
+cbc 5 2
+suntimes 5 2
+sports+yahoo 5 1
+allsports 5 2
+news+com 5 1
+wired 5 1
+usatoday 5 2
+pcworld 5 2
+economist 5 2
+sportingnews 5 2
+sun-sentinel 5 2
+cbs+sportsline 5 2
+hp 6 0
+ibm 6 0
+apple 6 1
+dell 6 2
+target 6 2
+walmart 6 2
+iht 7 0
+tomalak 7 0
+tnr 7 1
+prosportsdaily 7 1
+hinduonnet 7 1
+washingtontimes 7 1
+drudgereport 7 1
+theregister 7 1
+marsrovers+jpl+nasa 7 1
+heroicstories 7 1
+techreview 7 2
+csmonitor 7 1
+themoscowtimes 7 1
+kuro5hin 7 1
+catless+ncl+ac 7 1
+techjapan 7 1
+arstechnica 7 2
+theinquirer 7 1
+slate+msn 7 2
+globalsecurity 7 2
+pressherald+mainetoday 7 1
+worldtribune 7 1
+digitalidworld 7 2
+groklaw 7 2
+japantoday 7 1
+fark 7 2
+hardocp 7 2
+blogs+msdn 7 2
+space 7 2
+fuckedcompany 7 1
+gertzfile 7 2
+researchbuzz 7 2
+slashdot 7 2
+mobilewhack 7 2
+connection 8 0
+warehouse 8 0
+zones 8 1
+insight 8 1
+time 8 2
+computerworld 8 2
+infoworld 8 1
+eweek 8 2
+newyorkpost 9 0
+ny1 9 0
+stltoday 9 1
+observer 9 1
+timesunion 9 1
+chicagotribune 9 1
+orlandosentinel 9 2
+eonline 9 1
+theunionleader 9 2
+azcentral 9 2
+tampatrib 9 2
+latimes 9 2
+smh+com 9 2
+thisislondon+co 9 1
+tech+nytimes 9 1
+sfgate 9 1
+thestate 9 2
+jpost 9 1
+cleveland 9 2
+baltimoresun 9 2
+salon 9 1
+nytimes 9 1
+dallasnews 9 2
+nydailynews 9 1
+indystar 9 2
+ajc 9 2
+spaceflightnews 10 0
+spacetoday 10 0
+universetoday 10 1
+spaceDaily 10 1
+rednova 10 1
+spaceflightnow 10 1
+spaceref 10 1
+infosyncworld 11 0
+mtekk+com 11 0
+ppcw 11 1
+dpreview 11 1
+mobiletracker 11 1
+defamer 12 0
+gawker 12 0
+wonkette 12 1
+gizmodo 12 1
+pocketpcthoughts 13 0
+smartphonethoughts 13 0
+digitalmediathoughts 13 1
+eHomeUpgrade 13 2
+charlotte 14 0
+tallahassee 14 0
+miami 14 1
diff --git a/config/level1.ini b/config/level1.ini
new file mode 100644
index 0000000..d28df31
--- /dev/null
+++ b/config/level1.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:38:10 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=true
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=false
+Ignore\ Text\ Links=true
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=true
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=true
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=true
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=true
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=true
+Display\ Image\ ALTs=false
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=true
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=true
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=true
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=true
+Ignore\ <SELECT>\ Tags=true
+Ignore\ Link\ Lists=true
+Link/Text\ Removal\ Ratio=0.25
+Ignore\ Images=true
+Print\ Only\ Text=false
diff --git a/config/level10.ini b/config/level10.ini
new file mode 100644
index 0000000..b3c9227
--- /dev/null
+++ b/config/level10.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:59:33 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=false
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=false
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=false
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=false
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=false
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=false
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=false
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=false
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=false
+Ignore\ Link\ Lists=false
+Link/Text\ Removal\ Ratio=0.75
+Ignore\ Images=false
+Print\ Only\ Text=false
diff --git a/config/level11.ini b/config/level11.ini
new file mode 100644
index 0000000..eace12b
--- /dev/null
+++ b/config/level11.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:59:59 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=false
+Ignore\ Meta\ Tags=false
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=false
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=false
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=false
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=false
+Ignore\ Flash=false
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=false
+Ignore\ Forms=false
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=false
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=false
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=false
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=false
+Ignore\ Link\ Lists=false
+Link/Text\ Removal\ Ratio=0.75
+Ignore\ Images=false
+Print\ Only\ Text=false
diff --git a/config/level12.ini b/config/level12.ini
new file mode 100644
index 0000000..0691794
--- /dev/null
+++ b/config/level12.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 19:00:17 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=false
+Ignore\ Meta\ Tags=false
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=false
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=false
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=false
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=false
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=false
+Ignore\ Flash=false
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=false
+Ignore\ Forms=false
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=false
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=false
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=false
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=false
+Ignore\ Link\ Lists=false
+Link/Text\ Removal\ Ratio=0.75
+Ignore\ Images=false
+Print\ Only\ Text=false
diff --git a/config/level2.ini b/config/level2.ini
new file mode 100644
index 0000000..a221eaf
--- /dev/null
+++ b/config/level2.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:49:45 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=true
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=true
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=true
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=true
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=true
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=true
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=true
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=true
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=true
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=true
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=true
+Ignore\ <SELECT>\ Tags=true
+Ignore\ Link\ Lists=true
+Link/Text\ Removal\ Ratio=0.25
+Ignore\ Images=true
+Print\ Only\ Text=false
diff --git a/config/level3.ini b/config/level3.ini
new file mode 100644
index 0000000..1e3eecc
--- /dev/null
+++ b/config/level3.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:50:26 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=true
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=true
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=true
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=true
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=true
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=true
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=true
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=true
+Ignore\ Link\ Lists=true
+Link/Text\ Removal\ Ratio=0.25
+Ignore\ Images=true
+Print\ Only\ Text=false
diff --git a/config/level4.ini b/config/level4.ini
new file mode 100644
index 0000000..dbee198
--- /dev/null
+++ b/config/level4.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:52:26 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=false
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=false
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=true
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=true
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=true
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=true
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=true
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=true
+Ignore\ Link\ Lists=true
+Link/Text\ Removal\ Ratio=0.25
+Ignore\ Images=false
+Print\ Only\ Text=false
diff --git a/config/level5.ini b/config/level5.ini
new file mode 100644
index 0000000..effeff0
--- /dev/null
+++ b/config/level5.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:55:51 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=false
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=false
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=true
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=true
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=false
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=false
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=true
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=true
+Ignore\ Link\ Lists=true
+Link/Text\ Removal\ Ratio=0.25
+Ignore\ Images=false
+Print\ Only\ Text=false
diff --git a/config/level6.ini b/config/level6.ini
new file mode 100644
index 0000000..c426834
--- /dev/null
+++ b/config/level6.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:55:25 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=false
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=false
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=true
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=true
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=false
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=false
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=true
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=true
+Ignore\ Link\ Lists=true
+Link/Text\ Removal\ Ratio=0.5
+Ignore\ Images=false
+Print\ Only\ Text=false
diff --git a/config/level7.ini b/config/level7.ini
new file mode 100644
index 0000000..ec6b826
--- /dev/null
+++ b/config/level7.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:54:44 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=false
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=false
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=true
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=true
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=false
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=false
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=true
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=true
+Ignore\ Link\ Lists=true
+Link/Text\ Removal\ Ratio=0.75
+Ignore\ Images=false
+Print\ Only\ Text=false
diff --git a/config/level8.ini b/config/level8.ini
new file mode 100644
index 0000000..6573dec
--- /dev/null
+++ b/config/level8.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:58:17 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=false
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=false
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=true
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=true
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=false
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=false
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=false
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=true
+Ignore\ Link\ Lists=true
+Link/Text\ Removal\ Ratio=0.75
+Ignore\ Images=false
+Print\ Only\ Text=false
diff --git a/config/level9.ini b/config/level9.ini
new file mode 100644
index 0000000..c7997f4
--- /dev/null
+++ b/config/level9.ini
@@ -0,0 +1,42 @@
+#Content Extractor Settings File
+#Thu Feb 10 18:58:56 EST 2005
+<A>\ tags\ are\ substance=true
+<FORM>\ tags\ are\ substance=true
+Ignore\ Image\ Links=false
+Ignore\ Meta\ Tags=true
+Display\ Image\ Link\ ALTs=true
+Ignore\ Text\ Links=false
+<INPUT>\ tags\ are\ substance=true
+Ignore\ <IFRAME>\ Tags=true
+<BUTTON>\ tags\ are\ substance=true
+Ignore\ All\ Advertisements=true
+<SELECT>\ tags\ are\ substance=true
+Ignore\ External\ Stylesheets=false
+<IFRAME>\ tags\ are\ substance=true
+Ignore\ <EMBED>\ tags=true
+Ignore\ Styles=false
+Ignore\ Only\ Links\ and\ Text\ in\ Link\ Lists=true
+Ignore\ <INPUT>\ Tags=true
+Ignore\ Flash=true
+Ignore\ Image\ Links\ in\ Link\ Lists=true
+<IMG>\ tags\ are\ substance=true
+<TEXTAREA>\ tags\ are\ substance=true
+Ignore\ Scripts=true
+Ignore\ Forms=false
+Display\ Image\ ALTs=true
+Maximum\ Number\ of\ Line\ Breaks=2
+Ignore\ <BUTTON>\ Tags=false
+Limit\ Number\ of\ Line\ Breaks=false
+Ignore\ Style\ Attribute\ in\ <DIV>\ Tags=false
+Ignore\ <NOSCRIPT>\ Tags=true
+Add\ removed\ links\ to\ bottom\ of\ the\ page=false
+Minimum\ text\ length\ as\ substance=12
+Remove\ Empty\ Tables=false
+Ignore\ Text\ Links\ in\ Link\ Lists=true
+Ignore\ Table\ Cell\ Widths=true
+Ignore\ Style\ Attributes=false
+Ignore\ <SELECT>\ Tags=true
+Ignore\ Link\ Lists=false
+Link/Text\ Removal\ Ratio=0.75
+Ignore\ Images=false
+Print\ Only\ Text=false
diff --git a/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java b/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
index 70a41f7..265b633 100644
--- a/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
+++ b/src/psl/crunch3/plugins/contentextractor/ContentExtractor.java
@@ -14,6 +14,7 @@ import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStream;
 import java.io.PrintWriter;
+import java.util.Vector;
 import java.net.URL;
 import java.util.LinkedList;
 import java.util.Iterator;
@@ -50,6 +51,20 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 	public static final String LINK_HEAVY_SETTINGS_FILE_DEF = "config" + File.separator + "link heavy.ini";
 	public static final String AUTOMATIC_SETTINGS_FILE_DEF = "config" + File.separator + "automatic.ini";
 	public static final String CUSTOM_SETTINGS_FILE_DEF = "config" + File.separator + "custom.ini";
+
+	public static final String LEVEL1_SETTINGS_FILE_DEF = "config" + File.separator + "level1.ini";
+	public static final String LEVEL2_SETTINGS_FILE_DEF = "config" + File.separator + "level2.ini";
+	public static final String LEVEL3_SETTINGS_FILE_DEF = "config" + File.separator + "level3.ini";
+	public static final String LEVEL4_SETTINGS_FILE_DEF = "config" + File.separator + "level4.ini";
+	public static final String LEVEL5_SETTINGS_FILE_DEF = "config" + File.separator + "level5.ini";
+	public static final String LEVEL6_SETTINGS_FILE_DEF = "config" + File.separator + "level6.ini";
+	public static final String LEVEL7_SETTINGS_FILE_DEF = "config" + File.separator + "level7.ini";
+	public static final String LEVEL8_SETTINGS_FILE_DEF = "config" + File.separator + "level8.ini";
+	public static final String LEVEL9_SETTINGS_FILE_DEF = "config" + File.separator + "level9.ini";
+	public static final String LEVEL10_SETTINGS_FILE_DEF = "config" + File.separator + "level10.ini";
+	public static final String LEVEL11_SETTINGS_FILE_DEF = "config" + File.separator + "level11.ini";
+	public static final String LEVEL12_SETTINGS_FILE_DEF = "config" + File.separator + "level12.ini";
+
 	public static final String CONTENT_TEXT = "text/plain";
 	public static final String CONTENT_HTML = "text/html";

@@ -75,7 +90,9 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende

 	ContentExtractorSettings settings; // the settings
 	ContentExtractorDescriptionGUI descriptionGUI; // the description GUI
-
+	private Vector visitedClusters;
+
+
 	/**
 	 * Creates a new instance without any input stream and the default settings file.
 	 */
@@ -104,6 +121,8 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende
 		mLinksSourceAll = new LinkedList();
 		mLinksTextAll = new LinkedList();
 		mImagesSource = new LinkedList();
+
+		visitedClusters = new Vector();
 	}


@@ -1619,58 +1638,112 @@ public class ContentExtractor extends EnhancedProxyFilter implements SiteDepende

 		if (descriptionGUI.isAuto()){
 			//check what cluster the site belongs to and determine the correct filter.
-			WordCount wc = new WordCount(URL.substring(7), descriptionGUI.getFrequencies(),
-					descriptionGUI.getKeys(), descriptionGUI.getSites(), descriptionGUI.getEngineNumber());
+			int cluster = 0;

-			int cluster = descriptionGUI.getCluster(wc.getClosestSite());
-			System.out.println(cluster);
+			//check if the site is already clustered.
+			if((cluster = descriptionGUI.getCluster(WordCount.parseURL(URL.substring(7),true))) != 0){
+				System.out.println(URL +" is already clustered");
+			}
+			else{

+				WordCount wc = new WordCount(URL.substring(7), descriptionGUI.getFrequencies(),
+						descriptionGUI.getKeys(), descriptionGUI.getSites(), descriptionGUI.getEngineNumber());
+				String closest = wc.getClosestSite();
+				if(closest !=null)
+					cluster = descriptionGUI.getCluster(closest);
+				else cluster = 0;
+
+				System.out.println(cluster);


-			switch(cluster){
+			}

-				case 1: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				case 2: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				case 3: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				case 4: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				case 5: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				case 6: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				case 7: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				case 8: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				case 9: descriptionGUI.commitSettings(ContentExtractor.NEWS_SETTINGS_FILE_DEF);
-						break;
-				case 10: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-					 	 break;
-				case 11: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						 break;
-				case 12: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						 break;
-				case 13: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				case 14: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						 break;
-				case 15: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
-				default: descriptionGUI.commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
-						break;
+		    visitedClusters.addElement(new Integer(cluster));
+		    if(visitedClusters.size()>3){
+		    	visitedClusters.removeElementAt(0);
+		    }
+
+		    //this method should be changed after each run of WordCount.
+		    applySettings(cluster);
+

+			//if the page is a news home page change the setting to level 6
+			if(hpt.isHomePage()){
+				int level = descriptionGUI.getSettingLevel();
+				if(level ==2){
+					level+=4;
+					System.out.println("@@@@@@@@@@" + level);
+					descriptionGUI.commitSettings("config" + File.separator + "level" + level + ".ini", level);
+				}
 			}

+			if(visitedClusters.size()==3){
+			//if surfing is random then relax the setting
+				if(isRandomSurfing()){
+					System.out.println("Random Surfing Detected");
+					relax();
+				}
+			}
+
+
 		}
 	}

+	private void relax(){
+		descriptionGUI.commitSettings("config" + File.separator + "level" + 9 + ".ini", 9);
+	}
+
 	public void selectCustom(){
 		descriptionGUI.selectCustom();
 	}

+	private void applySettings(int cluster){
+		switch(cluster){
+
+			case 1: descriptionGUI.commitSettings(ContentExtractor.LEVEL10_SETTINGS_FILE_DEF, 10);
+					break;
+			case 2: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+					break;
+			case 3: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+					break;
+			case 4: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+					break;
+			case 5: descriptionGUI.commitSettings(ContentExtractor.LEVEL2_SETTINGS_FILE_DEF , 2);
+					break;
+			case 6: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+					break;
+			case 7: descriptionGUI.commitSettings(ContentExtractor.LEVEL2_SETTINGS_FILE_DEF , 2);
+					break;
+			case 8: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+					break;
+			case 9: descriptionGUI.commitSettings(ContentExtractor.LEVEL2_SETTINGS_FILE_DEF , 2);
+					break;
+			case 10: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+				 	 break;
+			case 11: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+					 break;
+			case 12: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+					 break;
+			case 13: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+					break;
+			case 14: descriptionGUI.commitSettings(ContentExtractor.LEVEL2_SETTINGS_FILE_DEF , 2);
+					 break;
+			case 15: descriptionGUI.commitSettings(ContentExtractor.LEVEL6_SETTINGS_FILE_DEF , 10);
+					break;
+			default: descriptionGUI.commitSettings(ContentExtractor.LEVEL12_SETTINGS_FILE_DEF , 12);
+					break;
+
+		}
+	}
+
+	private boolean isRandomSurfing(){
+		int current = ((Integer)(visitedClusters.elementAt(2))).intValue();
+		int prev1 = ((Integer)(visitedClusters.elementAt(1))).intValue();
+		int prev2 = ((Integer)(visitedClusters.elementAt(0))).intValue();
+		System.out.println(current + "  " + prev1 + "  "+ prev2);
+		if((current != prev1) && (current != prev2)) return true;
+		else return false;
+	}


 } //ContentExtractor
diff --git a/src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java b/src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java
index 968b8a9..1bfa5ef 100644
--- a/src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java
+++ b/src/psl/crunch3/plugins/contentextractor/ContentExtractorDescriptionGUI.java
@@ -75,6 +75,7 @@ public class ContentExtractorDescriptionGUI {
 	private Vector names;
 	private Hashtable clusters;
 	private int engineNumber = 5;
+	private int settingLevel = 0;

 	/**
 	 * @param c
@@ -326,32 +327,32 @@ public class ContentExtractorDescriptionGUI {
 	}

 	private void newsButton_widgetSelected(SelectionEvent e) {
-		commitSettings(ContentExtractor.NEWS_SETTINGS_FILE_DEF);
+		commitSettings(ContentExtractor.NEWS_SETTINGS_FILE_DEF, 0);
 		isAuto = false;
 	}

 	protected void shoppingButton_widgetSelected(SelectionEvent e) {
-		commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF);
+		commitSettings(ContentExtractor.SHOPPING_SETTINGS_FILE_DEF , 0);
 		isAuto = false;
 	}

 	protected void governmentButton_widgetSelected(SelectionEvent e) {
-		commitSettings(ContentExtractor.GOVERNMENT_SETTINGS_FILE_DEF);
+		commitSettings(ContentExtractor.GOVERNMENT_SETTINGS_FILE_DEF , 0);
 		isAuto = false;
 	}

 	protected void educationButton_widgetSelected(SelectionEvent e) {
-		commitSettings(ContentExtractor.EDUCATION_SETTINGS_FILE_DEF);
+		commitSettings(ContentExtractor.EDUCATION_SETTINGS_FILE_DEF , 0);
 		isAuto = false;
 	}

 	protected void textHeavyButton_widgetSelected(SelectionEvent e) {
-		commitSettings(ContentExtractor.TEXT_HEAVY_SETTINGS_FILE_DEF);
+		commitSettings(ContentExtractor.TEXT_HEAVY_SETTINGS_FILE_DEF , 0);
 		isAuto = false;
 	}

 	protected void linkHeavyButton_widgetSelected(SelectionEvent e) {
-		commitSettings(ContentExtractor.LINK_HEAVY_SETTINGS_FILE_DEF);
+		commitSettings(ContentExtractor.LINK_HEAVY_SETTINGS_FILE_DEF, 0);
 		isAuto = false;
 	}

@@ -393,12 +394,18 @@ public class ContentExtractorDescriptionGUI {
 			}
 			in.close();

-			int index;
+			String part1;
+			int index1, index2;
 			in = new BufferedReader(new FileReader(new File("clusters.txt")));
 			while((word =in.readLine())!=null){
-				index = word.indexOf(" ");
-				clusters.put(word.substring(0,index),
-						word.substring(index+1));
+				index1 = word.indexOf(" ");
+				part1 = word.substring(index1+1);
+				index2 = part1.indexOf(" ");
+				// Expected line format: "site cluster level". Without a second space, substring(0,index2) would throw StringIndexOutOfBoundsException.
+				if (index2 < 0) { System.err.println("skipping malformed clusters.txt line: " + word); continue; }
+				clusters.put(word.substring(0,index1),
+						new ClusterInfo(Integer.parseInt(part1.substring(0,index2)),
+								Integer.parseInt(part1.substring(index2+1))));
 			}
 			in.close();

@@ -407,11 +414,11 @@ public class ContentExtractorDescriptionGUI {
 		catch(Exception ex){
 			ex.printStackTrace();
 		}
-		commitSettings(ContentExtractor.AUTOMATIC_SETTINGS_FILE_DEF);
+
 	}

 	protected void customButton_widgetSelected(SelectionEvent e) {
-		commitSettings(ContentExtractor.CUSTOM_SETTINGS_FILE_DEF);
+		commitSettings(ContentExtractor.CUSTOM_SETTINGS_FILE_DEF , 0);
 		isAuto = false;
 	}

@@ -562,18 +569,27 @@ public class ContentExtractorDescriptionGUI {
     }

     public int getCluster(String closest){
-    	return Integer.parseInt((String)clusters.get(closest));
+    	ClusterInfo temp = (ClusterInfo)clusters.get(closest);
+    	if (temp != null)
+    		return temp.clusterNum;
+    	else return 0;
     }

     public int getEngineNumber(){
     	return engineNumber;
     }

+    /** Returns the level recorded by the most recent commitSettings call; 0 if none has run yet. */
+    public int getSettingLevel(){
+    	return settingLevel;
+    }
+
+
     /**
      * Changes the filter settings to new settings read from a file.
      * @param fileName the file containing the new filter settings.
      */
-    public void commitSettings(String fileName) {
+    public void commitSettings(String fileName , int level) {
     	File nSettingsFile = new File(fileName);
     	TypedProperties nSettings = new TypedProperties();
     	try {
@@ -584,6 +600,9 @@ public class ContentExtractorDescriptionGUI {
 		} catch (IOException e) {
 			e.printStackTrace();
 		}
+
+		settingLevel = level;
+
     	newFilter.changeSetting(ContentExtractorConstants.ONLY_TEXT, Boolean.toString
     			(nSettings.getProperty(ContentExtractorConstants.ONLY_TEXT, ContentExtractorConstants.ONLY_TEXT_DEF)));
     	newFilter.changeSetting(ContentExtractorConstants.IGNORE_ADS,Boolean.toString
@@ -682,4 +701,15 @@ public class ContentExtractorDescriptionGUI {

 	}

+
+    /** Immutable pair of numbers read from one clusters.txt line; static so entries do not retain the enclosing GUI object. */
+    static class ClusterInfo{
+    	private final int clusterNum; // first number after the site name
+    	private final int level;      // second number after the site name
+
+    	ClusterInfo(int num, int level){
+    		clusterNum = num;
+    		this.level = level;
+    	}
+    }
 }
diff --git a/src/psl/crunch3/util/WordCount.java b/src/psl/crunch3/util/WordCount.java
index bed3d1a..06b8617 100644
--- a/src/psl/crunch3/util/WordCount.java
+++ b/src/psl/crunch3/util/WordCount.java
@@ -172,8 +172,13 @@ public class WordCount extends JFrame{
 			}

 			sortDistance(distances,0, distances.size()-1);
-			System.out.println("the closest site is " + ((Distance)distances.elementAt(distances.size()-1)).site1);
-			closestSite = ((Distance)distances.elementAt(distances.size()-1)).site1;
+			Distance temp = (Distance)distances.elementAt(distances.size()-1);
+			System.out.println("the closest site is " + temp.site1);
+			System.out.println("the distance is " + temp.distance);
+			if(temp.distance <600)
+				closestSite = temp.site1;
+			else
+				closestSite = null;
 			names.removeElementAt(0);

 		}
@@ -191,7 +196,6 @@ public class WordCount extends JFrame{
 		System.out.println("generating word list for " + url);
 		BufferedReader in = getWebsite(url);
 		InputStreamReader read;
-		System.out.println("********************    " + engineNum);
 		try{
 			storeBuffer(in);
 			in.close();
@@ -604,6 +608,7 @@ public class WordCount extends JFrame{
 	public BufferedReader getWebsite(String address){
 		try{
 			URL url = new URL("http://" + address);
+
 			return new BufferedReader(new InputStreamReader(url.openStream()));
 		}
 		catch(Exception e){
@@ -680,7 +685,7 @@ public class WordCount extends JFrame{
 	/*
 	 * return the host name given the url string (assumes no http://...)
 	 */
-	private String parseURL(String url, boolean isRoot){
+	public static String parseURL(String url, boolean isRoot){
 		int i = url.indexOf('\\');
 		String first=url;
 		String second = "";
@@ -914,7 +919,7 @@ public class WordCount extends JFrame{
 				out.writeBytes("The elements in cluster #" + (i+1) + " are: \n");
 				for(int j=0;j<temp.size();j++){
 					cn = (ClusterNode)temp.elementAt(j);
-					os.writeBytes(cn.site + " " + (i+1) + "\n");
+					os.writeBytes(cn.site + " " + (i+1) + " " + cn.level + "\n");
 					System.out.println(cn.site + "\t" + cn.level +
 						"\t" + cn.pulled + "\t" + cn.distance);
 					out.writeBytes(cn.site + "\t" + cn.level +