Java学习者论坛

 找回密码
 立即注册

QQ登录

只需一步,快速开始

手机号码,快捷登录

恭喜Java学习者论坛(https://www.javaxxz.com)已经为数万Java学习者服务超过8年了!积累会员资料超过10000G+
成为本站VIP会员,下载本站10000G+会员资源,购买链接:点击进入购买VIP会员
JAVA高级面试进阶视频教程Java架构师系统进阶VIP课程

分布式高可用全栈开发微服务教程

Go语言视频零基础入门到精通

Java架构师3期(课件+源码)

Java开发全终端实战租房项目视频教程

SpringBoot2.X入门到高级使用教程

大数据培训第六期全套视频教程

深度学习(CNN RNN GAN)算法原理

Java亿级流量电商系统视频教程

互联网架构师视频教程

年薪50万Spark2.0从入门到精通

年薪50万!人工智能学习路线教程

年薪50万!大数据从入门到精通学习路线年薪50万!机器学习入门到精通视频教程
仿小米商城类app和小程序视频教程深度学习数据分析基础到实战最新黑马javaEE2.1就业课程从 0到JVM实战高手教程 MySQL入门到精通教程
查看: 2093|回复: 0

使用java模拟GOOGLE搜索引擎爬虫抓取网页

[复制链接]

该用户从未签到

发表于 2011-7-25 20:57:17 | 显示全部楼层 |阅读模式
package com.spider.obj;
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
public class Spider {
protected Collection workloadError = new ArrayList(3);
protected Collection workloadWaiting = new ArrayList(3);
protected Collection workloadProcessed = new ArrayList(3);
protected ISpiderReportable report;
protected boolean cancel = false;
public Spider(ISpiderReportable report)
{
this.report = report;
}
public Collection getWorkloadError()
{
return workloadError;
}
public Collection getWorkloadWaiting()
{
return workloadWaiting;
}
public Collection getWorkloadProcessed()
{
return workloadProcessed;
}   
public void clear()
{
getWorkloadError().clear();
getWorkloadWaiting().clear();
getWorkloadProcessed().clear();
}
public void cancel()
{
cancel = true;
}
public void addURL(URL url)
{
if ( getWorkloadWaiting().contains(url) )
return;
if ( getWorkloadError().contains(url) )
return;
if ( getWorkloadProcessed().contains(url) )
return;
log("Adding to workload: " + url );
getWorkloadWaiting().add(url);
}
public void processURL(URL url)
{
try {
log(&quotrocessing: " + url );
// get the URL's contents
URLConnection connection = url.openConnection();
if ( (connection.getContentType()!=null) &&
    !connection.getContentType().toLowerCase().startsWith("text/") ) {
getWorkloadWaiting().remove(url);
getWorkloadProcessed().add(url);
log("Not processing because content type is: " +
      connection.getContentType() );
return;
}
// read the URL
InputStream is = connection.getInputStream();
Reader r = new InputStreamReader(is);
// parse the URL
HTMLEditorKit.Parser parse = new HTMLParse().getParser();
parse.parse(r,new Parser(url),true);
} catch ( IOException e ) {
getWorkloadWaiting().remove(url);
getWorkloadError().add(url);
log("Error: " + url );
report.spiderURLError(url);
return;
}
// mark URL as complete
getWorkloadWaiting().remove(url);
getWorkloadProcessed().add(url);
log("Complete: " + url );
}
public void begin()
{
cancel = false;
while ( !getWorkloadWaiting().isEmpty() && !cancel ) {
Object list[] = getWorkloadWaiting().toArray();
for ( int i=0;(i<list.length)&&!cancel;i++ )
   processURL((URL)list);
}
}
protected class Parser
extends HTMLEditorKit.ParserCallback {
protected URL base;
public Parser(URL base)
{
this.base = base;
}
public void handleSimpleTag(HTML.Tag t,
                         MutableAttributeSet a,int pos)
{
String href = (String)a.getAttribute(HTML.Attribute.HREF);
if( (href==null) && (t==HTML.Tag.FRAME) )
href = (String)a.getAttribute(HTML.Attribute.SRC);
if ( href==null )
return;
int i = href.indexOf('#');
if ( i!=-1 )
href = href.substring(0,i);
if ( href.toLowerCase().startsWith("mailto:") ) {
report.spiderFoundEMail(href);
return;
}
if(t==HTML.Tag.META)
{
String title = (String)a.getAttribute(HTML.Attribute.NAME);
System.out.println("title:"+title);
}
handleLink(base,href);
}
public void handleStartTag(HTML.Tag t,
                        MutableAttributeSet a,int pos)
{
handleSimpleTag(t,a,pos);    // handle the same way
}
protected void handleLink(URL base,String str)
{
try {
URL url = new URL(base,str);
if ( report.spiderFoundURL(base,url) )
   addURL(url);
} catch ( MalformedURLException e ) {
log("Found malformed URL: " + str );
}
}
}
public void log(String entry)
{
System.out.println( (new Date()) + ":" + entry );
}
}

package com.spider.obj;
import java.net.*;
/**
 * Callback through which a spider reports what it finds while crawling.
 */
interface ISpiderReportable {

  /**
   * Called for every link found on a page.
   *
   * @param base the page on which the link was found
   * @param url  the resolved link
   * @return {@code true} if the spider should queue {@code url} for processing
   */
  boolean spiderFoundURL(URL base, URL url);

  /**
   * Called when a URL could not be read.
   *
   * @param url the URL that failed
   */
  void spiderURLError(URL url);

  /**
   * Called for every {@code mailto:} link found on a page.
   *
   * @param email the link text, including the {@code mailto:} prefix
   */
  void spiderFoundEMail(String email);
}
package com.spider.obj;
import javax.swing.text.html.*;
/**
 * Thin subclass of {@link HTMLEditorKit} whose only job is to make the
 * kit's parser reachable through a public {@code getParser()} method.
 */
public class HTMLParse extends HTMLEditorKit {

  /**
   * Returns the parser supplied by the superclass.
   *
   * @return the parser obtained from {@code super.getParser()}
   */
  public HTMLEditorKit.Parser getParser()
  {
    final HTMLEditorKit.Parser inherited = super.getParser();
    return inherited;
  }
}
package com.spider.obj;
import java.awt.*;
import javax.swing.*;
import java.net.*;
import java.io.*;
public class CheckLinks extends javax.swing.JFrame implements
      Runnable,ISpiderReportable {


public CheckLinks()
{
//{{INIT_CONTROLS

setTitle("Find Broken Links");
getContentPane().setLayout(null);
setSize(405,288);
setVisible(true);
label1.setText("Enter a URL:");
getContentPane().add(label1);
label1.setBounds(12,12,84,12);
begin.setText("Begin");
begin.setActionCommand("Begin");
getContentPane().add(begin);
begin.setBounds(12,36,84,24);
getContentPane().add(url);
url.setBounds(108,36,288,24);
errorScroll.setAutoscrolls(true);
errorScroll.setHorizontalScrollBarPolicy(javax.swing.
         ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS);
errorScroll.setVerticalScrollBarPolicy(javax.swing.
         ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS);
errorScroll.setOpaque(true);
getContentPane().add(errorScroll);
errorScroll.setBounds(12,120,384,156);
errors.setEditable(false);
errorScroll.getViewport().add(errors);
errors.setBounds(0,0,366,138);
current.setText("Currently Processing: ");
getContentPane().add(current);
current.setBounds(12,72,384,12);
goodLinksLabel.setText("Good Links: 0");
getContentPane().add(goodLinksLabel);
goodLinksLabel.setBounds(12,96,192,12);
badLinksLabel.setText("Bad Links: 0");
getContentPane().add(badLinksLabel);
badLinksLabel.setBounds(216,96,96,12);
//}}
//{{INIT_MENUS
//}}

//{{REGISTER_LISTENERS
SymAction lSymAction = new SymAction();
begin.addActionListener(lSymAction);
//}
}

static public void main(String args[])
{
(new CheckLinks()).setVisible(true);
}


public void addNotify()
{
// Record the size of the window prior to calling parent's
// addNotify.
Dimension size = getSize();

super.addNotify();

if ( frameSizeAdjusted )
return;
frameSizeAdjusted = true;

//Adjust size of frame according to the insets and menu bar
Insets insets = getInsets();
javax.swing.JMenuBar menuBar = getRootPane().getJMenuBar();
int menuBarHeight = 0;
if ( menuBar != null )
menuBarHeight = menuBar.getPreferredSize().height;
setSize(insets.left + insets.right + size.width, insets.top +
                     insets.bottom + size.height +
                     menuBarHeight);
}

// Used by addNotify
boolean frameSizeAdjusted = false;

//{{DECLARE_CONTROLS
javax.swing.JLabel label1 = new javax.swing.JLabel();


javax.swing.JButton begin = new javax.swing.JButton();


javax.swing.JTextField url = new javax.swing.JTextField();


javax.swing.JScrollPane errorScroll =
   new javax.swing.JScrollPane();


javax.swing.JTextArea errors = new javax.swing.JTextArea();
javax.swing.JLabel current = new javax.swing.JLabel();
javax.swing.JLabel goodLinksLabel = new javax.swing.JLabel();
javax.swing.JLabel badLinksLabel = new javax.swing.JLabel();
//}}

//{{DECLARE_MENUS
//}}


protected Thread backgroundThread;


protected Spider spider;


protected URL base;


protected int badLinksCount = 0;


protected int goodLinksCount = 0;
class SymAction implements java.awt.event.ActionListener {
public void actionPerformed(java.awt.event.ActionEvent event)
{
   Object object = event.getSource();
   if ( object == begin )
     begin_actionPerformed(event);
}
}


void begin_actionPerformed(java.awt.event.ActionEvent event)
{
if ( backgroundThread==null ) {
   begin.setLabel("Cancel");
   backgroundThread = new Thread(this);
   backgroundThread.start();
   goodLinksCount=0;
   badLinksCount=0;
} else {
   spider.cancel();
}

}


public void run()
{
try {
   errors.setText("");
   spider = new Spider(this);
   spider.clear();
   base = new URL(url.getText());
   spider.addURL(base);
   spider.begin();
   Runnable doLater = new Runnable()
   {
     public void run()
     {
       begin.setText("Begin");
     }
   };
   SwingUtilities.invokeLater(doLater);
   backgroundThread=null;

} catch ( MalformedURLException e ) {
   UpdateErrors err = new UpdateErrors();
   err.msg = "Bad address.";
   SwingUtilities.invokeLater(err);

}
}


public boolean spiderFoundURL(URL base,URL url)
{
UpdateCurrentStats cs = new UpdateCurrentStats();
cs.msg = url.toString();
SwingUtilities.invokeLater(cs);

if ( !checkLink(url) ) {
   UpdateErrors err = new UpdateErrors();
   err.msg = url+"(on page " + base + ")\n";
   SwingUtilities.invokeLater(err);
   badLinksCount++;
   return false;
}

goodLinksCount++;
if ( !url.getHost().equalsIgnoreCase(base.getHost()) )
   return false;
else
   return true;
}


public void spiderURLError(URL url)
{
System.out.println("没找到");
}


protected boolean checkLink(URL url)
{
try {
   URLConnection connection = url.openConnection();
   connection.connect();
   return true;
} catch ( IOException e ) {
   return false;
}
}


public void spiderFoundEMail(String email)
{
System.out.println("");
}


class UpdateErrors implements Runnable {
public String msg;
public void run()
{
   errors.append(msg);
}
}


class UpdateCurrentStats implements Runnable {
public String msg;
public void run()
{
   current.setText("Currently Processing: " + msg );
   goodLinksLabel.setText("Good Links: " + goodLinksCount);
   badLinksLabel.setText("Bad Links: " + badLinksCount);
}
}
}

回复

使用道具 举报

  • TA的每日心情
    开心
    2021-3-12 23:18
  • 签到天数: 2 天

    [LV.1]初来乍到

    发表于 2011-7-25 23:38:56 | 显示全部楼层
    不错,学习了。
    回复 支持 反对

    使用道具 举报

    该用户从未签到

    发表于 2011-7-25 23:39:30 | 显示全部楼层
    很好的帖子。大家相互学习了。
    回复 支持 反对

    使用道具 举报

    该用户从未签到

    发表于 2011-7-25 23:39:51 | 显示全部楼层
    很好,应该设为精华。
    回复 支持 反对

    使用道具 举报

    您需要登录后才可以回帖 登录 | 立即注册

    本版积分规则

    QQ|手机版|Java学习者论坛 ( 声明:本站资料整理自互联网,用于Java学习者交流学习使用,对资料版权不负任何法律责任,若有侵权请及时联系客服屏蔽删除 )

    GMT+8, 2025-1-22 14:59 , Processed in 0.513506 second(s), 48 queries .

    Powered by Discuz! X3.4

    © 2001-2017 Comsenz Inc.

    快速回复 返回顶部 返回列表