您现在的位置是:首页 >

如何利用web 利用C#实现web信息自动抓取

火烧 2023-01-19 06:52:33 1067
利用C#实现web信息自动抓取 背景：随着Internet的普及，网络信息正以极高的速度增长，在这么多数据中找到自己需要的信息是一件很繁琐的事情，找到需要的信息后如何获取也是件麻烦的事，这就需要Internet信息抓取程序来代替人工的操作……

利用C#实现web信息自动抓取  

如何利用web 利用C#实现web信息自动抓取
背景：随着Internet的普及，网络信息正以极高的速度增长，在这么多数据中找到自己需要的信息是一件很繁琐的事情，找到需要的信息后如何获取也是件麻烦的事。这就需要Internet信息抓取程序来代替人工的操作。所谓Internet信息抓取程序，就是程序会按照用户的关键词或关键网站来收集相应的信息，并提供给用户想要的信息格式。   信息量的增加会带来信息网站发布人员工作量的剧增，为使信息发布系统实现信息自动发布、减少工作人员工作量、即时跟踪最新信息，就需要自动信息提供程序，因此Internet信息抓取程序应运而生。   目标：实现自定义网站信息分类抓取，存入本地数据库，生成静态页面或其它用户定义的信息结构，并下载与信息相关的多媒体文件。   开发 目标站点结构分析：本步骤是准确抓取信息的关键。首先要选择更新频率高的页面作为抓取地址，然后分析要抓取内容页面的url特点，然后分析要抓取信息页面的元素特性，比如标题位置、内容位置等，得到定位标记点。将以上信息写成自己的配置文件或存到数据库中。每个网站都需要分析，写出单独的配置文件，供抓取程序使用。  信息提取：根据配置文件取得要抓取页面url，使用HttpWebRequest类获取内容。 //获取页面函数         public string Get_Http(string a_strUrl int timeout)         {             string strResult ;                       try             { HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest Create(a_strUrl) ;                 myReq Timeout = timeout;                 HttpWebResponse HttpWResp = (HttpWebResponse)myReq GetResponse();                             Stream myStream = HttpWResp GetResponseStream () ;                   StreamReader sr = new StreamReader(myStream Encoding Default);                 StringBuilder strBuilder = new StringBuilder();                 while ( != sr Peek())                 {                     strBuilder Append(sr ReadLine()+ rn );                 }                   strResult = strBuilder ToString();             }             catch(Exception exp)             {                 strResult = 错误 + exp Message ;             }               return strResult ;           } 获取页面内容后，分析页面中的链接地址，取到要抓取的url。 //处理页面标题和链接         public string SniffWebUrl( string urlStr string blockB string blockE )         {                   string urlch = ;             string urlch = ;                                int end_n = ;             int end_nums = ;             int end_nums = ;             int end_nums = ;             int end_nums      = ;                        string reUTStr = ;             string reTitle = ;             string ret = ;                       try             {                 int pos = urlStr IndexOf( );           
      int pos = urlStr LastIndexOf( / );                 if( pos < )                 {                     return ;                 }                 if( pos < )                 {                     return ;                 }                 int pos = urlStr IndexOf( / pos );                 if ( pos < )                 {                     urlch = urlStr;                     urlch = urlStr;                 }                 else                 {                     urlch = urlStr Substring( pos );                     urlch = urlStr Substring( pos );                 }                   string tmpAllStr = new PublicFun() Get_Http( urlStr time );                   int pos = tmpAllStr IndexOf( blockB );                 int pos = tmpAllStr IndexOf( blockE pos + blockB Length );                 if ( pos > && pos > && pos >pos )                 {                     ret = tmpAllStr Substring( pos + blockB Length pos pos blockB Length );                     ret = ret Substring( ret IndexOf( < ));                     while( ret IndexOf( <A ) >= )                     {                         ret = ret Substring( ret IndexOf( <A ) ) + <a + ret Substring( ret IndexOf( <A ) + );                     }                     while( ret IndexOf( </A ) >= )                     {                         ret = ret Substring( ret IndexOf( </A ) ) + </a + ret Substring( ret IndexOf( </A ) + );                     }                     while( ret IndexOf( Href= ) >= )                     {                         ret = ret Substring( ret IndexOf( Href= )) + + ret Substring( ret IndexOf( Href= ) + );                     }                     while( ret IndexOf( HREF= ) >= )                     {                         ret = ret Substring( ret IndexOf( HREF= )) + + ret Substring( ret IndexOf( HREF= ) + );                     }                     while( ret IndexOf( ) >= )                     {                         ret = ret Substring( ret IndexOf( )) + href= + ret Substring( ret 
IndexOf( ) + );                     }                 }                       tmpAllStr = ret;                      int begin_nums = tmpAllStr IndexOf( );                   while ( begin_nums >= )                 {                                   string tmpStrA = ;                     string tmpStrB = tmpAllStr Substring( begin_nums + );                     if ( tmpStrB == )                     {                         end_n = begin_nums + ;                         if ( ( end_n + ) > tmpAllStr Length )                         {                             return ;                         }                         tmpStrA = tmpAllStr Substring( begin_nums+ );                     }                     else                     {                         end_n = begin_nums + ;                         tmpStrA = tmpStrB;                     }                       if ( tmpStrA == # )                     {                         tmpAllStr = tmpAllStr Substring( end_n );                         begin_nums = tmpAllStr IndexOf( );                     }                     else                     {                                           end_nums = tmpAllStr IndexOf( end_n );                         end_nums = tmpAllStr IndexOf( > end_n );                         end_nums = tmpAllStr IndexOf( </a end_nums );                           if ( ( end_nums >= ) && ( end_nums >= ) )                         {                             reTitle = tmpAllStr Substring( end_nums + end_nums end_nums );                               if ( end_nums > end_nums )                             {                                 end_nums = end_nums ;                             }                             else                             {                                 if ( end_nums < )                                 {                                     end_nums = end_nums ;                                 }                                 else                                 {          
                           end_nums = end_nums ;                                 }                             }                             string str = tmpAllStr Substring( end_nums end_nums end_nums + );                               if ( str ==  || str == )                             {                                 end_nums = end_nums ;                             }                             string sTotalOne = tmpAllStr Substring( end_n end_nums end_n );                               if ( sTotalOne IndexOf( // ) < )                             {                                 if ( sTotalOne IndexOf( / ) == )                                 {                                     sTotalOne = urlch + sTotalOne;                                 }                                 else                                 {                                                                   int linshiIntNum = ;                                     int flags = ;                                     string urlChange = urlStr;;                                     while( sTotalOne IndexOf( / ) >= )                                     {                                         sTotalOne = sTotalOne Substring( sTotalOne IndexOf( / ) + );                                         linshiIntNum = linshiIntNum + ;                                         flags = flags + ;                                     }                                     while( ( urlChange LastIndexOf( / ) >= ) && ( linshiIntNum >= ) )                                     {                                         urlChange = urlChange Substring( urlChange LastIndexOf( / ) );                                         linshiIntNum = linshiIntNum ;                                     }                                     if ( flags == )                                     {                                         sTotalOne = urlch + / + sTotalOne;                                     }                                    
 else                                     {                                         sTotalOne = urlChange + / + sTotalOne;                                     }                                 }                             }                             reUTStr = reUTStr + new PublicFun() RemoveHtmlCode( reTitle ) + sTotalOne;                               tmpAllStr = tmpAllStr Substring( end_nums + );                             begin_nums = tmpAllStr IndexOf( );                         }                         else                         {                             begin_nums = ;                         }                                        }                 }                 return reUTStr;             }             catch( Exception e)             {                 return ;             }         }   得到要抓取内容的url后 处理该页面 //获取链接内容并分类处理         public string GetWebContent( string gatherUrl string subUrl string subTitle string b_Content string e_Content string b_Filter string e_Filter string root )         {             string tmpAllStr = ;                        string dfStrB = ;             string dfStrE = ;                            string rePicStr = ;//图片返回路径                string reContentStr = ;             string picHtml = images ; //本地图片路径                         string urlch = ;             string urlch = ;             int pos = gatherUrl IndexOf( );             int pos = gatherUrl LastIndexOf( / );             if( pos < )             {                 return ;             }             if( pos < )             {                                return ;             }             int pos = gatherUrl IndexOf( / pos );             if ( pos < )             {                 urlch = gatherUrl;                 urlch = gatherUrl;             }             else             {                 urlch = gatherUrl Substring( pos );                 urlch = gatherUrl Substring( pos );             }                            tmpAllStr = new PublicFun() Get_Http( subUrl 
time );             //取稿源             string docFromStr = ;             if ( dfStrB != && dfStrE != )             {                 if ( tmpAllStr != )                 {                     int b_docF = tmpAllStr IndexOf( dfStrB );                     if ( b_docF > )                     {                         int e_docF = tmpAllStr IndexOf( dfStrE b_docF + dfStrB Length );                         if ( e_docF > && e_docF > b_docF && e_docF b_docF < )                         {                             docFromStr = tmpAllStr Substring( b_docF + dfStrB Length e_docF b_docF dfStrB Length );                         }                     }                 }             }             //取内容             if ( tmpAllStr != )             {                                int begin_strnum = tmpAllStr IndexOf( b_Content );                 if ( begin_strnum < )                 {                                       return ;                 }                 int end_strnum = tmpAllStr IndexOf( e_Content begin_strnum + b_Content Length );                 if ( end_strnum < )                 {                                       return ;                 }                 string sTotalSubM = ;                 if ( end_strnum > begin_strnum )                 {                     sTotalSubM = tmpAllStr Substring ( begin_strnum end_strnum begin_strnum );                 }                                 if ( sTotalSubM == )                 {                                       return ;                 }                               //过滤无用信息                 int bfnum = sTotalSubM IndexOf( b_Filter );                 if ( bfnum > )                 {                     int efnum = sTotalSubM IndexOf( e_Filter bfnum );                     if ( efnum > )                     {                         if ( efnum > bfnum )                         {                             sTotalSubM = sTotalSubM Substring( bfnum ) + sTotalSubM Substring( efnum + e_Filter Length );                 
        }                     }                 }                 //格式化图片标记                                 while( sTotalSubM IndexOf( Src= ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( Src= ) ) + src= + sTotalSubM Substring( sTotalSubM IndexOf( Src= ) + );                 }                 while( sTotalSubM IndexOf( SRC= ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( SRC= ) ) + src= + sTotalSubM Substring( sTotalSubM IndexOf( SRC= ) + );                 }                 while( sTotalSubM IndexOf( src= ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( src= ) ) + src= + sTotalSubM Substring( sTotalSubM IndexOf( src= ) + );                 }                   //取图片地址                 int end_n = ;                 int end_nums = ;                 int begin_nums = sTotalSubM IndexOf( src= );                 while( begin_nums >= )                 {                     String tmpStr = sTotalSubM Substring( begin_nums + );                     if ( tmpStr == )                     {                         end_n = begin_nums + ;                     }                     else                     {                         end_n = begin_nums + ;                     }                     int end_nums a = sTotalSubM IndexOf( end_n );                     int end_nums b = sTotalSubM IndexOf( > end_n );                     if ( end_nums b < )                     {                         break;                     }                     if ( end_nums a > end_nums b )                     {                         end_nums = end_nums b;                     }                     else                     {                         if (end_nums a< )                         {                             end_nums = end_nums b;                         }                         else                         {                             
end_nums = end_nums a;                         }                     }                     tmpStr = sTotalSubM Substring( end_nums );                     if ( tmpStr == || tmpStr == )                     {                         end_nums = end_nums ;                     }                     string tmpPicStr = sTotalSubM Substring( end_n end_nums end_n );                       if ( tmpPicStr IndexOf( // ) < )                     {                         if ( tmpPicStr IndexOf( / ) == )                         {                             tmpPicStr = urlch + tmpPicStr;                         }                         else                         {                                                        int linshiIntNum = ;                             int flags = ;                             string urlChange = subUrl;                             while( tmpPicStr IndexOf( / ) >= )                             {                                 tmpPicStr = tmpPicStr Substring( tmpPicStr IndexOf( / ) + );                                 linshiIntNum = linshiIntNum + ;                                 flags = flags + ;                             }                             while( ( urlChange LastIndexOf( / ) >= ) && ( linshiIntNum >= ) )                             {                                 urlChange = urlChange Substring( urlChange LastIndexOf( / ) );                                 linshiIntNum = linshiIntNum ;                             }                             if ( flags == )                             {                                 tmpPicStr = urlch + / + tmpPicStr;                             }                             else                             {                                 tmpPicStr = urlChange + / + tmpPicStr;                             }                         }                     }                     //tmpPicStr = tmpPicStr ToLower();                     string tmpPicStrTmp = tmpPicStr ToLower();                     //if 
( tmpPicStr IndexOf( jpg ) > || tmpPicStr IndexOf( gif ) > || tmpPicStr IndexOf( bmp ) > )                     if ( tmpPicStrTmp IndexOf( jpg ) > || tmpPicStrTmp IndexOf( gif ) > || tmpPicStrTmp IndexOf( bmp ) > )                     {                         rePicStr = rePicStr + || + tmpPicStr ;                           int flagN = tmpPicStr LastIndexOf( / );                         string fileN = picHtml + tmpPicStr Substring( flagN );                         sTotalSubM = sTotalSubM Substring( end_nums ) + >****** + fileN + ******< + sTotalSubM Substring( end_nums );                           begin_nums = sTotalSubM IndexOf( src= end_nums + fileN Length + );                     }                     else                     {                         begin_nums = sTotalSubM IndexOf( src= end_nums + );                                            }                                   }                 if ( rePicStr Length > )                      rePicStr = rePicStr Substring( );                                 //内容处理 格式化关键标记                 while( sTotalSubM IndexOf( <P ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <P ) ) + |****|< + sTotalSubM Substring( sTotalSubM IndexOf( <P ) + );                 }                 while( sTotalSubM IndexOf( <p ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <p ) ) + |****|< + sTotalSubM Substring( sTotalSubM IndexOf( <p ) + );                 }                 while( sTotalSubM IndexOf( </P ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( </P ) ) + |****|< + sTotalSubM Substring( sTotalSubM IndexOf( </P ) + );                 }                 while( sTotalSubM IndexOf( </p ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( </p ) ) + |****|< + sTotalSubM Substring( sTotalSubM IndexOf( </p ) + );                 }               
  while( sTotalSubM IndexOf( <br ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <br ) ) + +****+< + sTotalSubM Substring( sTotalSubM IndexOf( <br ) + );                 }                 while( sTotalSubM IndexOf( <BR ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <BR ) ) + +****+< + sTotalSubM Substring( sTotalSubM IndexOf( <BR ) + );                 }                 while( sTotalSubM IndexOf( <Br ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <Br ) ) + +****+< + sTotalSubM Substring( sTotalSubM IndexOf( <Br ) + );                 }                 while( sTotalSubM IndexOf( <bR ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( <bR ) ) + +****+< + sTotalSubM Substring( sTotalSubM IndexOf( <bR ) + );                 }                   //去除标记                 int linshiInt = sTotalSubM IndexOf( < );                 int linshiInt = sTotalSubM IndexOf( > );                              if ( linshiInt < linshiInt )                 {                     sTotalSubM = sTotalSubM Substring( linshiInt + );                 }                 int linshiInt = sTotalSubM LastIndexOf( < );                 int linshiInt = sTotalSubM LastIndexOf( > );                 if ( linshiInt < linshiInt )                 {                     sTotalSubM = sTotalSubM Substring( linshiInt + );                 }                 linshiInt = sTotalSubM IndexOf( < );                 while ( linshiInt >= )                 {                     linshiInt = sTotalSubM IndexOf( > linshiInt );                     if ( linshiInt >= )                     {                                       sTotalSubM = sTotalSubM Substring( linshiInt ) + sTotalSubM Substring( linshiInt + );                     }                     else                     {                         sTotalSubM = sTotalSubM 
Substring( linshiInt );                     }                     linshiInt = sTotalSubM IndexOf( < );                 }                   //还原关键标记                 int linshiInt = ;                 int linshiInt = ;                   while( sTotalSubM IndexOf( +****+ ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( +****+ ) ) + <br>n + sTotalSubM Substring( sTotalSubM IndexOf( +****+ ) + );                 }                 while( sTotalSubM IndexOf( |****| ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( |****| ) ) + <br>n + sTotalSubM Substring( sTotalSubM IndexOf( |****| ) + );                 }                 while( sTotalSubM IndexOf( ****** ) >= )                 {                     linshiInt = sTotalSubM IndexOf( ****** ) + ;                     linshiInt = sTotalSubM IndexOf( ****** linshiInt );                     if ( linshiInt >= )                     {                         int tmpPos = sTotalSubM IndexOf( ****** );                         string tmpStr = sTotalSubM Substring( tmpPos );                         string tmpStr = sTotalSubM Substring( linshiInt linshiInt linshiInt );                         string tmpStr = sTotalSubM Substring( linshiInt + );                         sTotalSubM = tmpStr + <img src= + tmpStr + > + tmpStr ;                     }                     else                     {                         break;                     }                 }                 //去除内容中的标题                 if ( sTotalSubM IndexOf( subTitle ) >= )                 {                     sTotalSubM = sTotalSubM Substring( sTotalSubM IndexOf( subTitle ) ) + sTotalSubM Substring( sTotalSubM IndexOf( subTitle ) + subTitle Length );                 }                 reContentStr = sTotalSubM;                 //调用下载图片功能                 //下载图片到指定目录                 string[] img_Url = new PublicFun() split( rePicStr || );                 for ( int i= 
;i<img_Url Length;i++ )                 {                     if ( img_Url[i] != )                     {                         new PublicFun() Get_Img( img_Url[i] root + \images\ + img_Url[i] Substring( img_Url[i] LastIndexOf( / )+ ) );                     }                 }             }             return reContentStr;           } 以上方法返回要取得的信息，包括标题内容、图片地址等。   下载页面中图片： //下载图片         public void Get_Img(string a_strUrl int timeout string filepath)         {             try             { HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest Create(a_strUrl) ;                 myReq Timeout = timeout;                 HttpWebResponse HttpWResp = (HttpWebResponse)myReq GetResponse();                         Stream myStream = HttpWResp GetResponseStream () ;                          Bitmap map = new Bitmap( myStream );                 PictureBox picB = new PictureBox();                 picB Image = (Image)map;                 string path = filepath Substring( filepath LastIndexOf( \ ) );                 if (!Directory Exists(path))                 {                     CreateDir( path );                 }                               picB Image Save(filepath);                            }             catch(Exception exp)             {                 string ss = exp Message;     WriteLog( filepath Substring( filepath LastIndexOf( \ )) + \error log a_strUrl + + ss + rn );                 }         }   保存文件或入库：上面取得的信息可以按自己的要求保存。   ****设计的时候没有使用url按层次循环抓取，这样定义抓取url效率更高、速度更快。     测试程序下载   如有建议请发送EMail或msn。   注：此版本只提供静态文件存储功能，不提供数据库接口，不提供自定义网站功能。     本程序运行需要先安装.NET框架。 lishixinzhi/Article/program/net/201311/12401  
永远跟党走
  • 如果你觉得本站很棒,可以通过扫码支付打赏哦!

    • 微信收款码
    • 支付宝收款码