您现在的位置是:首页
>
如何利用web 利用C#实现web信息自动抓取
利用C#实现web信息自动抓取。背景:随着Internet的普及,网络信息正以极高的速度增长。在这么多数据中找到自己需要的信息是一件很繁琐的事情,找到需要的信息后如何获取也是件麻烦的事,这就需要Internet信息抓取程序来代替人工的操作……
利用C#实现web信息自动抓取

背景 随着Internet的普及 网络信息正以极高的速度增长 在这么多数据中找到自己需要的信息是一件很繁琐的事情 找到需要的信息后如何获取也是件麻烦的事 这就需要Internet信息抓取程序来代替人工的操作 所谓Internet信息抓取程序 就是程序会按照用户的关键词或关键网站来收集相应的信息 并提供给用户想要的信息格式 信息量的增加会带来信息网站发布人员工作量的剧增 为实现信息发布系统实现信息自 动发布 减少工作人员工作量 即时跟踪最新信息 就需要自动信息提供程序 因此Internet信息抓取程序应运而生 目标 实现自定义网站信息分类抓取 存入本地数据库 生成静态页面或其它用户定义的信息结构 并下载与信息相关的多媒体文件 开发 目标站点结构分析 本步骤是准确抓取信息个关键 首先要选择更新频率高的页面做为抓取地址 然后分析要抓取内容页面url特点 然后分析要抓取信息页面的元素特性 比如标题位置 内容位置 等 得到定位标记点 将以上信息写成自己的配置文件或存到数据库中 每个网站都需要分析 写出单独的配置文件 供抓取程序使用 信息提取 根据配置文件取得要抓取页面url 使用HttpWebRequest类获取内容 //获取页面函数 public string Get_Http(string a_strUrl int timeout) { string strResult ; try { HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest Create(a_strUrl) ; myReq Timeout = timeout; HttpWebResponse HttpWResp = (HttpWebResponse)myReq GetResponse(); Stream myStream = HttpWResp GetResponseStream () ; StreamReader sr = new StreamReader(myStream Encoding Default); StringBuilder strBuilder = new StringBuilder(); while ( != sr Peek()) { strBuilder Append(sr ReadLine()+ rn ); } strResult = strBuilder ToString(); } catch(Exception exp) { strResult = 错误 + exp Message ; } return strResult ; } 获取页面内容后 分析页面中连接地址取到要抓取的url //处理页面标题和链接 public string SniffWebUrl( string urlStr string blockB string blockE ) { string urlch = ; string urlch = ; int end_n = ; int end_nums = ; int end_nums = ; int end_nums = ; int end_nums = ; string reUTStr = ; string reTitle = ; string ret = ; try { int pos = urlStr IndexOf( ); int pos = urlStr LastIndexOf( / ); if( pos < ) { return ; } if( pos < ) { return ; } int pos = urlStr IndexOf( / pos ); if ( pos < ) { urlch = urlStr; urlch = urlStr; } else { urlch = urlStr Substring( pos ); urlch = urlStr Substring( pos ); } string tmpAllStr = new PublicFun() Get_Http( urlStr time ); int pos = tmpAllStr IndexOf( blockB ); int pos = tmpAllStr IndexOf( blockE pos + blockB Length ); if ( pos > && pos > && pos >pos ) { ret = tmpAllStr Substring( pos + blockB Length pos pos blockB Length ); ret = ret Substring( ret IndexOf( < )); while( ret IndexOf( <A ) >= 
) { ret = ret Substring( ret IndexOf( <A ) ) + <a + ret Substring( ret IndexOf( <A ) + ); } while( ret IndexOf( </A ) >= ) { ret = ret Substring( ret IndexOf( </A ) ) + </a + ret Substring( ret IndexOf( </A ) + ); } while( ret IndexOf( Href= ) >= ) { ret = ret Substring( ret IndexOf( Href= )) + + ret Substring( ret IndexOf( Href= ) + ); } while( ret IndexOf( HREF= ) >= ) { ret = ret Substring( ret IndexOf( HREF= )) + + ret Substring( ret IndexOf( HREF= ) + ); } while( ret IndexOf( ) >= ) { ret = ret Substring( ret IndexOf( )) + href= + ret Substring( ret IndexOf( ) + ); } } tmpAllStr = ret; int begin_nums = tmpAllStr IndexOf( ); while ( begin_nums >= ) { string tmpStrA = ; string tmpStrB = tmpAllStr Substring( begin_nums + ); if ( tmpStrB == ) { end_n = begin_nums + ; if ( ( end_n + ) > tmpAllStr Length ) { return ; } tmpStrA = tmpAllStr Substring( begin_nums+ ); } else { end_n = begin_nums + ; tmpStrA = tmpStrB; } if ( tmpStrA == # ) { tmpAllStr = tmpAllStr Substring( end_n ); begin_nums = tmpAllStr IndexOf( ); } else { end_nums = tmpAllStr IndexOf( end_n ); end_nums = tmpAllStr IndexOf( > end_n ); end_nums = tmpAllStr IndexOf( </a end_nums ); if ( ( end_nums >= ) && ( end_nums >= ) ) { reTitle = tmpAllStr Substring( end_nums + end_nums end_nums ); if ( end_nums > end_nums ) { end_nums = end_nums ; } else { if ( end_nums < ) { end_nums = end_nums ; } else { end_nums = end_nums ; } } string str = tmpAllStr Substring( end_nums end_nums end_nums + ); if ( str == || str == ) { end_nums = end_nums ; } string sTotalOne = tmpAllStr Substring( end_n end_nums end_n ); if ( sTotalOne IndexOf( // ) < ) { if ( sTotalOne IndexOf( / ) == ) { sTotalOne = urlch + sTotalOne; } else { int linshiIntNum = ; int flags = ; string urlChange = urlStr;; while( sTotalOne IndexOf( / ) >= ) { sTotalOne = sTotalOne Substring( sTotalOne IndexOf( / ) + ); linshiIntNum = linshiIntNum + ; flags = flags + ; } while( ( urlChange LastIndexOf( / ) >= ) && ( linshiIntNum >= ) ) { urlChange = urlChange 
Substring( urlChange LastIndexOf( / ) ); linshiIntNum = linshiIntNum ; } if ( flags == ) { sTotalOne = urlch + / + sTotalOne; } else { sTotalOne = urlChange + / + sTotalOne; } } } reUTStr = reUTStr + new PublicFun() RemoveHtmlCode( reTitle ) + sTotalOne; tmpAllStr = tmpAllStr Substring( end_nums + ); begin_nums = tmpAllStr IndexOf( ); } else { begin_nums = ; } } } return reUTStr; } catch( Exception e) { return ; } } 得到要抓取内容的url后 处理该页面 //获取链接内容并分类处理 public string GetWebContent( string gatherUrl string subUrl string subTitle string b_Content string e_Content string b_Filter string e_Filter string root ) { string tmpAllStr = ; string dfStrB = ; string dfStrE = ; string rePicStr = ;//图片返回路径 string reContentStr = ; string picHtml = images ; //本地图片路径 string urlch = ; string urlch = ; int pos = gatherUrl IndexOf( ); int pos = gatherUrl LastIndexOf( / ); if( pos < ) { return ; } if( pos < ) { return ; } int pos = gatherUrl IndexOf( / pos ); if ( pos < ) { urlch = gatherUrl; urlch = gatherUrl; } else { urlch = gatherUrl Substring( pos ); urlch = gatherUrl Substring( pos ); } tmpAllStr = new PublicFun() Get_Http( subUrl time ); //取稿源 string docFromStr = ; if ( dfStrB != && dfStrE != ) { if ( tmpAllStr != ) { int b_docF = tmpAllStr IndexOf( dfStrB ); if ( b_docF > ) { int e_docF = tmpAllStr IndexOf( dfStrE b_docF + dfStrB Length ); if ( e_docF > && e_docF > b_docF && e_docF b_docF < ) { docFromStr = tmpAllStr Substring( b_docF + dfStrB Length e_docF b_docF dfStrB Length ); } } } } //取内容 if ( tmpAllStr != ) { int begin_strnum = tmpAllStr IndexOf( b_Content ); if ( begin_strnum < ) { return ; } int end_strnum = tmpAllStr IndexOf( e_Content begin_strnum + b_Content Length ); if ( end_strnum < ) { return ; } string sTotalSubM = ; if ( end_strnum > begin_strnum ) { sTotalSubM = tmpAllStr Substring ( begin_strnum end_strnum begin_strnum ); } if ( sTotalSubM == ) { return ; } //过滤无用信息 int bfnum = sTotalSubM IndexOf( b_Filter ); if ( bfnum > ) { int efnum = sTotalSubM IndexOf( 
e_Filter bfnum ); if ( efnum > ) { if ( efnum > bfnum ) { sTotalSubM = sTotalSubM Substring( bfnum ) + sTotalSubM Substring( efnum + e_Filter Length ); } } } //格式化图片标记 while( sTotalSubM IndexOf( Src= ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( Src= ) ) + src= + sTotalSubM Substring( sTotalSubM IndexOf( Src= ) + ); } while( sTotalSubM IndexOf( SRC= ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( SRC= ) ) + src= + sTotalSubM Substring( sTotalSubM IndexOf( SRC= ) + ); } while( sTotalSubM IndexOf( src= ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( src= ) ) + src= + sTotalSubM Substring( sTotalSubM IndexOf( src= ) + ); } //取图片地址 int end_n = ; int end_nums = ; int begin_nums = sTotalSubM IndexOf( src= ); while( begin_nums >= ) { String tmpStr = sTotalSubM Substring( begin_nums + ); if ( tmpStr == ) { end_n = begin_nums + ; } else { end_n = begin_nums + ; } int end_nums a = sTotalSubM IndexOf( end_n ); int end_nums b = sTotalSubM IndexOf( > end_n ); if ( end_nums b < ) { break; } if ( end_nums a > end_nums b ) { end_nums = end_nums b; } else { if (end_nums a< ) { end_nums = end_nums b; } else { end_nums = end_nums a; } } tmpStr = sTotalSubM Substring( end_nums ); if ( tmpStr == || tmpStr == ) { end_nums = end_nums ; } string tmpPicStr = sTotalSubM Substring( end_n end_nums end_n ); if ( tmpPicStr IndexOf( // ) < ) { if ( tmpPicStr IndexOf( / ) == ) { tmpPicStr = urlch + tmpPicStr; } else { int linshiIntNum = ; int flags = ; string urlChange = subUrl; while( tmpPicStr IndexOf( / ) >= ) { tmpPicStr = tmpPicStr Substring( tmpPicStr IndexOf( / ) + ); linshiIntNum = linshiIntNum + ; flags = flags + ; } while( ( urlChange LastIndexOf( / ) >= ) && ( linshiIntNum >= ) ) { urlChange = urlChange Substring( urlChange LastIndexOf( / ) ); linshiIntNum = linshiIntNum ; } if ( flags == ) { tmpPicStr = urlch + / + tmpPicStr; } else { tmpPicStr = urlChange + / + tmpPicStr; } } } //tmpPicStr = tmpPicStr ToLower(); string tmpPicStrTmp = 
tmpPicStr ToLower(); //if ( tmpPicStr IndexOf( jpg ) > || tmpPicStr IndexOf( gif ) > || tmpPicStr IndexOf( bmp ) > ) if ( tmpPicStrTmp IndexOf( jpg ) > || tmpPicStrTmp IndexOf( gif ) > || tmpPicStrTmp IndexOf( bmp ) > ) { rePicStr = rePicStr + || + tmpPicStr ; int flagN = tmpPicStr LastIndexOf( / ); string fileN = picHtml + tmpPicStr Substring( flagN ); sTotalSubM = sTotalSubM Substring( end_nums ) + >****** + fileN + ******< + sTotalSubM Substring( end_nums ); begin_nums = sTotalSubM IndexOf( src= end_nums + fileN Length + ); } else { begin_nums = sTotalSubM IndexOf( src= end_nums + ); } } if ( rePicStr Length > ) rePicStr = rePicStr Substring( ); //内容处理 格式化关键标记 while( sTotalSubM IndexOf( <P ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <P ) ) + |****|< + sTotalSubM Substring( sTotalSubM IndexOf( <P ) + ); } while( sTotalSubM IndexOf( <p ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <p ) ) + |****|< + sTotalSubM Substring( sTotalSubM IndexOf( <p ) + ); } while( sTotalSubM IndexOf( </P ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( </P ) ) + |****|< + sTotalSubM Substring( sTotalSubM IndexOf( </P ) + ); } while( sTotalSubM IndexOf( </p ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( </p ) ) + |****|< + sTotalSubM Substring( sTotalSubM IndexOf( </p ) + ); } while( sTotalSubM IndexOf( <br ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <br ) ) + +****+< + sTotalSubM Substring( sTotalSubM IndexOf( <br ) + ); } while( sTotalSubM IndexOf( <BR ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <BR ) ) + +****+< + sTotalSubM Substring( sTotalSubM IndexOf( <BR ) + ); } while( sTotalSubM IndexOf( <Br ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <Br ) ) + +****+< + sTotalSubM Substring( sTotalSubM IndexOf( <Br ) + ); } while( sTotalSubM IndexOf( <bR ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <bR ) ) + +****+< + sTotalSubM Substring( 
sTotalSubM IndexOf( <bR ) + ); } //去除标记 int linshiInt = sTotalSubM IndexOf( < ); int linshiInt = sTotalSubM IndexOf( > ); if ( linshiInt < linshiInt ) { sTotalSubM = sTotalSubM Substring( linshiInt + ); } int linshiInt = sTotalSubM LastIndexOf( < ); int linshiInt = sTotalSubM LastIndexOf( > ); if ( linshiInt < linshiInt ) { sTotalSubM = sTotalSubM Substring( linshiInt + ); } linshiInt = sTotalSubM IndexOf( < ); while ( linshiInt >= ) { linshiInt = sTotalSubM IndexOf( > linshiInt ); if ( linshiInt >= ) { sTotalSubM = sTotalSubM Substring( linshiInt ) + sTotalSubM Substring( linshiInt + ); } else { sTotalSubM = sTotalSubM Substring( linshiInt ); } linshiInt = sTotalSubM IndexOf( < ); } //还原关键标记 int linshiInt = ; int linshiInt = ; while( sTotalSubM IndexOf( +****+ ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( +****+ ) ) + <br>n + sTotalSubM Substring( sTotalSubM IndexOf( +****+ ) + ); } while( sTotalSubM IndexOf( |****| ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( |****| ) ) + <br>n + sTotalSubM Substring( sTotalSubM IndexOf( |****| ) + ); } while( sTotalSubM IndexOf( ****** ) >= ) { linshiInt = sTotalSubM IndexOf( ****** ) + ; linshiInt = sTotalSubM IndexOf( ****** linshiInt ); if ( linshiInt >= ) { int tmpPos = sTotalSubM IndexOf( ****** ); string tmpStr = sTotalSubM Substring( tmpPos ); string tmpStr = sTotalSubM Substring( linshiInt linshiInt linshiInt ); string tmpStr = sTotalSubM Substring( linshiInt + ); sTotalSubM = tmpStr + <img src= + tmpStr + > + tmpStr ; } else { break; } } //去除内容中的标题 if ( sTotalSubM IndexOf( subTitle ) >= ) { sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( subTitle ) ) + sTotalSubM Substring( sTotalSubM IndexOf( subTitle ) + subTitle Length ); } reContentStr = sTotalSubM; //调用下载图片功能 //下载图片到指定目录 string[] img_Url = new PublicFun() split( rePicStr || ); for ( int i= ;i<img_Url Length;i++ ) { if ( img_Url[i] != ) { new PublicFun() Get_Img( img_Url[i] root + \images\ + img_Url[i] Substring( 
img_Url[i] LastIndexOf( / )+ ) ); } } } return reContentStr; } 以上方法返回要取得的信息,包括标题、内容、图片地址等。 下载页面中图片: //下载图片 public void Get_Img(string a_strUrl int timeout string filepath) { try { HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest Create(a_strUrl) ; myReq Timeout = timeout; HttpWebResponse HttpWResp = (HttpWebResponse)myReq GetResponse(); Stream myStream = HttpWResp GetResponseStream () ; Bitmap map = new Bitmap( myStream ); PictureBox picB = new PictureBox(); picB Image = (Image)map; string path = filepath Substring( filepath LastIndexOf( \ ) ); if (!Directory Exists(path)) { CreateDir( path ); } picB Image Save(filepath); } catch(Exception exp) { string ss = exp Message; WriteLog( filepath Substring( filepath LastIndexOf( \ )) + \error log a_strUrl + + ss + rn ); } } 保存文件或入库:上面取得的信息可以按自己的要求保存。****设计的时候没有使用url按层次循环抓取,这样定义抓取url效率更高、速度更快。测试程序下载。如有建议请发送EMail或msn。注:此版本只提供静态文件存储功能,不提供数据库接口,不提供自定义网站功能。本程序运行需要先安装 .NET 框架。 lishixinzhi/Article/program/net/201311/12401
很赞哦! (1067)