发起HTTP请求并随后处理HTML响应的最佳方法。

/**
 * Creates a cURL easy handle for $url and registers it under $connId.
 *
 * The handle is stored in $this->multiConn[$connId] and configured to
 * return the response body as a string, omit response headers from the
 * output, accept gzip/deflate encoded responses, and carry $postString
 * as its CURLOPT_POSTFIELDS payload.
 *
 * @param mixed  $connId     Key under which the handle is stored in $this->multiConn.
 * @param string $url        URL passed to curl_init().
 * @param string $postString Value assigned to CURLOPT_POSTFIELDS.
 *
 * @return void
 */
public function curlInit($connId, $url , $postString)
{
    // Register the handle first, then configure it through a local alias.
    $handle = $this->multiConn[$connId] = curl_init($url);

    // Options are applied one at a time, in this exact order.
    $options = array(
        array(CURLOPT_RETURNTRANSFER, true),
        array(CURLOPT_HEADER, false),
        array(CURLOPT_ENCODING, 'gzip,deflate'),
        array(CURLOPT_POSTFIELDS, $postString),
    );

    foreach ($options as $pair) {
        curl_setopt($handle, $pair[0], $pair[1]);
    }
}

/**
 * Runs every registered transfer to completion and collects the bodies.
 *
 * Calls $this->multiAddHandler() first, then drives the multi handle
 * stored in $this->multiHanler until no transfer is running. Afterwards
 * the content of every handle in $this->multiConn is copied into
 * $this->multiRespose under the same key.
 *
 * NOTE(review): the property names "multiHanler" and "multiRespose" are
 * spelled exactly as in the original code; they are presumably referenced
 * elsewhere in the class, so they are left untouched here.
 *
 * @return void
 */
public function multiExec()
{
    $this->multiAddHandler();

    $running = null;
    do {
        $status = curl_multi_exec($this->multiHanler, $running);

        // Block until there is socket activity instead of spinning the CPU
        // in a tight loop while transfers are still pending.
        if ($running && $status === CURLM_OK) {
            if (curl_multi_select($this->multiHanler, 1.0) === -1) {
                // select() can fail immediately on some systems; back off
                // briefly so the loop does not degrade into a busy-wait.
                usleep(1000);
            }
        }

        // CURLM_CALL_MULTI_PERFORM (old libcurl) means "call again right away".
        // Any other non-OK status is a hard error: stop instead of looping forever.
    } while ($running && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

    foreach ($this->multiConn as $key => $v) {
        $this->multiRespose[$key] = curl_multi_getcontent($v);
    }
}

/**
 * Runs the scraping pipeline in three steps: collect the categories,
 * execute the queued HTTP requests, then process the event responses.
 *
 * @return void
 */
public function parse()
{
    // Step 1: fetch and process the categories.
    $this->getCategoryElementList("li");

    // Step 2: fetch all events of each category.
    $this->curl->multiExec();

    // Step 3: parse the events response.
    $this->processEvents();

    // The details of each individual event still have to be parsed after this.
}

/**
 * Walks every category group and records its categories and sub-categories.
 *
 * For each id in $this->categoryIdsArr the matching element is looked up,
 * its child node list is appended to $this->categoryElementList, and every
 * <li> child is treated as one category:
 *   - the category name is read from a "nav-special-name" span, falling
 *     back to a "nav-region-name" span;
 *   - the category id is segment 4 of the name link's href, or, when the
 *     name is plain text, the sum of the byte values of the name;
 *   - every <a> inside the <li> is stored as a sub-category, and a cURL
 *     connection is queued for it via $this->curl->curlInit() unless its
 *     id equals the auto-generated category id.
 *
 * Results are written to $this->arrRes['category'].
 *
 * A <li> without a usable name span, or whose id cannot be determined, is
 * skipped rather than reusing the previous iteration's name/id.
 *
 * @param string $tag Currently unused; the method always looks for 'li' nodes.
 *
 * @return void
 */
public function getCategoryElementList($tag)
{
    foreach ($this->categoryIdsArr as $group) {
        $domElement = $this->getElementById($group);
        $catList = $domElement->childNodes;
        $this->categoryElementList[] = $catList;

        foreach ($catList as $cat) {
            // Reset per category: ids that were auto-generated must not
            // get a cURL connection of their own.
            $autoGenSubCatIds = array();

            if ($cat->nodeName != 'li') {
                continue;
            }

            // -- Getting the category name -- //
            $catNameSpan = $this->searchElement($cat, "span", "class", "nav-special-name");
            if (empty($catNameSpan->item(0)->nodeValue)) {
                $catNameSpan = $this->searchElement($cat, "span", "class", "nav-region-name");
            }

            $nameNode = $catNameSpan->item(0);
            if ($nameNode === null) {
                // No name span at all: nothing to key this category on.
                continue;
            }
            $_categoryName = (string) $nameNode->nodeValue;

            $firstChild = $nameNode->childNodes->item(0);
            if ($firstChild === null) {
                continue;
            }

            if ($firstChild->nodeName == 'a') {
                // Linked name: the id is embedded in the href.
                $_categoryId = $this->extractIdFromHref($firstChild->getAttribute("href"));
                if ($_categoryId === null) {
                    continue;
                }
            } elseif ($firstChild->nodeName == '#text') {
                // Plain-text name: derive an id from the bytes of the name.
                $_categoryId = array_sum(array_map('ord', str_split($_categoryName)));
                $autoGenSubCatIds[] = $_categoryId;
            } else {
                continue;
            }
            // -- End getting the category name -- //

            $this->arrRes['category'][$_categoryId]['category_name'] = $_categoryName;
            $this->arrRes['category'][$_categoryId]['category_id'] = $_categoryId;

            foreach ($cat->getElementsByTagName("a") as $subCat) {
                $_subCategoryId = $this->extractIdFromHref($subCat->getAttribute("href"));
                if ($_subCategoryId === null) {
                    continue;
                }

                $this->arrRes['category'][$_categoryId]['subcat'][$_subCategoryId]['subcat_name'] = $subCat->nodeValue;
                $this->arrRes['category'][$_categoryId]['subcat'][$_subCategoryId]['subcat_id'] = $_subCategoryId;

                // Loose comparison on purpose: href ids are strings while
                // auto-generated ids are integers.
                if (!in_array($_subCategoryId, $autoGenSubCatIds)) {
                    $this->curl->curlInit($_subCategoryId, "https://********************.com", "Ids=$_subCategoryId&stId=4&page=0");
                }
            }
        }
    }
}

/**
 * Returns segment 4 of a "/"-separated href, or null when it is missing.
 *
 * @param string $href
 *
 * @return string|null
 */
private function extractIdFromHref($href)
{
    // explode() replaces split(), which was removed in PHP 7.
    $segments = explode("/", $href);

    return isset($segments[4]) ? $segments[4] : null;
}

1条回答

网友

1楼 · 发布于 2024-10-03 19:23:33

您描述的过程主要包括三个部分：

从internet获取html文档
解析html
将解析结果存储在数据库中

过程的第一部分主要取决于您的网络连接、服务器的响应时间等。无论您使用何种语言/技术，都不会有太大的区别，因为大部分时间都花在低级系统调用和网络延迟上。

第三部分同样主要取决于数据库引擎的性能，尽管您可以在某些地方做一些优化（如何构建查询、如何管理事务、是保持持久连接还是每次重新连接等）。

这就剩下HTML解析部分了。如果是格式良好的HTML，那么您使用的语言很可能已有一些用C实现、高度优化的解析器可用（例如Python的lxml，它是对libxml2和libxslt的绑定）。在代码的编写方式上也可能有一些优化空间（这是泛泛而谈，并非针对上面的代码片段——我并没有细读它），但具体如何做取决于您使用的确切语言/技术。

现在的重点是，无论解析器和您自己的代码优化得多好，您仍将受制于网络和数据库的性能。在这里获得更好性能的唯一方法是尽可能地将整个过程并行化：在尽可能多的节点上运行HTTP客户端，按需在足够多的节点上运行解析器，并由这些节点各自把结果写入数据库。

我们在一个Python应用程序中解决过非常类似的问题：使用Celery和RabbitMQ进行并行化，在4个不同的“worker”节点上平衡负载（另有一个节点用于数据库，一个用于Django/Apache前端）。不过在大多数语言/技术中都有同样好甚至更好的解决方案（比如MapReduce？）。

相关问题更多 >

编程相关推荐

热门问题

热门文章