Sponsored Links

Go Back   WebMasters.org Forums > Search Engine Forums > Google Forum

Reply
 
LinkBack Thread Tools Display Modes
  #1 (permalink)  
Old 03-13-2008, 05:10 PM
smartcoderin smartcoderin is offline
Junior Member
 
Join Date: Mar 2008
Posts: 16
Exclamation How to Scrap Google

Have you ever thought about scraping Google results?


It is possible.
Try this:




<?php


/**
 * Contract for a search-result scraper.
 *
 * Implementations are configured with a result limit and a keyword, fetch a
 * result page, and parse that page's markup. Google_Engine is the
 * implementation shipped alongside this interface.
 */
interface Search_Interface
{
    /**
     * Set the record limit for a search.
     *
     * @param mixed $recordLimit Requested limit.
     */
    public function setRecordLimit($recordLimit);

    /**
     * Set the keyword to search for.
     *
     * @param mixed $searchKeyword Search keyword.
     */
    public function setSearchKeyword($searchKeyword);

    /**
     * Fetch the content of a web page.
     *
     * @param mixed $pageUrl Page URL.
     */
    public function getWebpageContent($pageUrl);

    /**
     * Parse previously fetched page content.
     *
     * @param mixed $content Page content.
     */
    public function parseWebpage($content);
}




<?php


/*
****************************************************************************
* @ Written by smart_coder@yahoo.co.in
* @ Usage: for learning the concept only
* @ Tested: under normal conditions
****************************************************************************
*/

//error_reporting(E_ALL);


/**
 * Scrapes Google web-search result pages with cURL and extracts, for each
 * result, the link URL, the link title and the description snippet.
 *
 * The parser is regex based: it splits the page on `<div ... class=g...`
 * containers and looks for anchors carrying `class=l` plus a
 * `<font size=-1>` snippet. Pages whose markup does not match these patterns
 * simply yield no records.
 *
 * Errors are reported through raiseError(), which emits an E_USER_WARNING and
 * then terminates the script.
 */
class Google_Engine implements Search_Interface
{
    /** URL-encoded search keyword, or null when none is set. */
    public $searchKeyword = null;
    /** URL of the result page most recently requested by searchGoogle(). */
    public $searchUrl = null;
    /** Raw body of the most recently fetched page. */
    public $pageContent = null;
    public $linkText = array();
    public $linkUrl = array();
    /** Maximum number of records to collect (0 means "nothing to do"). */
    public $maxRecord = 0;
    /** URL template; the literal text "keyword" is replaced by the search keyword. */
    public $googleUrl = null;
    /** Options passed to curl_setopt_array() for every request. */
    public $curlOptions = array();
    /** Collected results: list of arrays with keys pageUrl, pageTitle, pageInfo. */
    public $googleLinks = array();
    /** Number of entries currently stored in $googleLinks. */
    public $searchRecord = 0;
    /** Human-readable run log, appended to the daily log file by searchGoogle(). */
    public $parseInfo = null;

    /**
     * Initialise the URL template and cURL options, then apply the limit and keyword.
     *
     * @param mixed $maxRecord     Maximum number of records to collect.
     * @param mixed $searchKeyword Keyword to search for (un-encoded).
     */
    public function __construct($maxRecord, $searchKeyword)
    {
        $this->googleUrl = "http://www.google.co.in/search?q=keyword&hl=en&sa=N";

        $this->curlOptions = array(
            CURLOPT_RETURNTRANSFER => true,     // return web page
            CURLOPT_HEADER         => false,    // don't return headers
            CURLOPT_FOLLOWLOCATION => true,     // follow redirects
            CURLOPT_ENCODING       => "",       // handle all encodings
            CURLOPT_USERAGENT      => "spider", // who am i
            CURLOPT_AUTOREFERER    => true,     // set referer on redirect
            CURLOPT_CONNECTTIMEOUT => 120,      // timeout on connect
            CURLOPT_TIMEOUT        => 120,      // timeout on response
            CURLOPT_MAXREDIRS      => 10,       // stop after 10 redirects
        );

        $this->setRecordLimit($maxRecord);
        $this->setSearchKeyword($searchKeyword);
    }

    /**
     * Set the maximum number of records to collect.
     *
     * Non-numeric or empty values fall back to 0. Numeric values are stored
     * as an integer so later arithmetic and comparisons never operate on a
     * raw request string.
     *
     * @param mixed $maxRecord Requested limit.
     */
    public function setRecordLimit($maxRecord)
    {
        if (!is_numeric($maxRecord) || empty($maxRecord)) {
            $this->maxRecord = 0;
            return;
        }

        $this->maxRecord = (int) $maxRecord;
    }

    /**
     * Set the keyword to search for; it is stored URL-encoded.
     *
     * @param mixed $searchKeyword Keyword; an empty value resets it to null.
     */
    public function setSearchKeyword($searchKeyword)
    {
        if (empty($searchKeyword)) {
            $this->searchKeyword = null;
            return;
        }

        $this->searchKeyword = urlencode($searchKeyword);
    }

    /**
     * Report a fatal problem: emit an E_USER_WARNING and stop the script.
     *
     * @param string $errorString Message for the warning.
     */
    public function raiseError($errorString)
    {
        trigger_error($errorString, E_USER_WARNING);
        die("<br><b>Google Search Halted</b>");
    }

    /**
     * Run the search: fetch and parse ceil(maxRecord / 10) result pages, then
     * append the run log to "Log-YYYY-MM-DD.txt" in the working directory.
     *
     * Results accumulate in $googleLinks / $searchRecord.
     */
    public function searchGoogle()
    {
        if (empty($this->searchKeyword)) {
            $this->raiseError("Search Keyword Empty");
            return;
        }
        if ($this->maxRecord == 0) {
            $this->raiseError("Nothing to do: max record is zero");
            return;
        }

        // Build the keyword URL once. Each page URL is derived from this base
        // so that "&start=" parameters do not pile up from page to page.
        $baseUrl   = str_replace("keyword", $this->searchKeyword, $this->googleUrl);
        $pageLimit = ceil($this->maxRecord / 10);

        $this->parseInfo .= "\n-----Starting Scrap-----\n";

        for ($page = 1; $page <= $pageLimit; $page++) {
            $this->parseInfo .= "\n\n-----Page $page-----\n\n";

            $pageStart       = ($page - 1) * 10;
            $this->searchUrl = $baseUrl . "&start=" . $pageStart;

            $this->getWebpageContent($this->searchUrl); // read the webpage
            $this->parseWebpage($this->pageContent);    // extract the results
        }

        $this->parseInfo .= "\n-----Records" . $this->searchRecord . "-----\n";
        $this->parseInfo .= "\n-----End Scrap----\n";

        // Write the run report. fopen() already emits a warning on failure;
        // skip the write instead of handing a non-resource to fwrite()/fclose().
        $fp = fopen("Log-" . date("Y-m-d") . ".txt", "a");
        if ($fp !== false) {
            fwrite($fp, $this->parseInfo);
            fclose($fp);
        }
    }

    /**
     * Download a page with cURL and store its body in $pageContent.
     *
     * @param string $pageUrl URL to fetch. When empty, $searchUrl is used, which
     *                        keeps callers that relied on the old behaviour working.
     */
    public function getWebpageContent($pageUrl)
    {
        $targetUrl = empty($pageUrl) ? $this->searchUrl : $pageUrl;

        $curlCon = curl_init($targetUrl);
        if ($curlCon === false) {
            $this->raiseError("Unable to initialise cURL");
            return;
        }

        curl_setopt_array($curlCon, $this->curlOptions);
        $pageContent = curl_exec($curlCon);
        $errNo       = curl_errno($curlCon);
        $errMsg      = curl_error($curlCon);
        curl_close($curlCon);

        if ($errNo !== 0) {
            // connection error or curl not performed well
            $this->raiseError($errMsg);
            return;
        }

        $this->pageContent = $pageContent; // content ready for parsing
    }

    /**
     * Extract results (link URL, link title, description) from a result page
     * and append them to $googleLinks until $maxRecord entries are stored.
     *
     * @param string $content Page markup. When empty, $pageContent is used, which
     *                        keeps callers that relied on the old behaviour working.
     */
    public function parseWebpage($content)
    {
        $html = empty($content) ? $this->pageContent : $content;
        if (empty($html)) {
            $this->raiseError("Webpage Content Empty");
            return;
        }

        $patternResult = "/<div[\sa-zA-Z0-9_.='\"]*class[\s]*=g[a-zA-Z0-9='\".]*/";
        $urlPattern    = "/a[\sa-zA-Z='\"._0-9]*href[\s]*=['\"]+([a-zA-Z0-9:\/._=\+\?\&#;\-\s]+)['\"]+[\sa-zA-Z0-9=+'\".]*class[\s]*=[\s\'\"0-9]*[l\s]+>([a-zA-Z0-9_.,><:;\/\s\-\|#\&]+[\s]*)[<\/]*a>/";
        $infoPattern   = "/font[\s]*size[\s]*=[\s]*-1[\s]*>([a-zA-Z0-9:_.'\",;\>\<\/\-\s\&\?#'\"()\@\!\*\^\%]+)[\s\<]*span[\s]class/";

        // One chunk per result container; preg_split() returns false on a PCRE error.
        $matchItems = preg_split($patternResult, $html);
        if ($matchItems === false) {
            $matchItems = array();
        }

        foreach ($matchItems as $partSplit) {
            // Stop as soon as the limit is reached. Checking before adding
            // means at most $maxRecord records are ever stored.
            if ($this->searchRecord >= $this->maxRecord) {
                break;
            }

            $matchUrl  = array();
            $matchInfo = array();

            preg_match_all($urlPattern, $partSplit, $matchUrl);
            preg_match_all($infoPattern, $partSplit, $matchInfo);

            // Chunks without a result link (e.g. the text before the first
            // container) produce no matches; guard the index access.
            $mainUrl  = isset($matchUrl[1][0]) ? $matchUrl[1][0] : null;
            $mainText = isset($matchUrl[2][0]) ? strip_tags($matchUrl[2][0]) : null;
            $mainInfo = isset($matchInfo[1][0]) ? strip_tags($matchInfo[1][0]) : '';

            if (empty($mainUrl) || empty($mainText)) {
                continue;
            }

            $this->googleLinks[$this->searchRecord] = array(
                'pageUrl'   => $mainUrl,
                'pageTitle' => $mainText,
                'pageInfo'  => $mainInfo,
            );
            $this->searchRecord = $this->searchRecord + 1;
        }
    }
}






?>




<?php


// Page controller: reads the search form parameters, runs the Google scraper
// and leaves $keyword and $links ready for the HTML template that follows.
include("SearchInterface.php");
include("GoogleEngine.class.php");

// The page is also requested without any parameters (first visit), so the
// keys may be missing. Array-valued input is rejected because the keyword is
// later URL-encoded and echoed as text.
$keyword = (isset($_REQUEST['keyword']) && is_string($_REQUEST['keyword'])) ? $_REQUEST['keyword'] : '';
$limit = isset($_REQUEST['limit']) ? $_REQUEST['limit'] : '';
if (empty($limit)) {
    $limit = 10;
}

// Always defined, so the template can count() and iterate it even when no
// search was performed.
$links = array();

if (!empty($keyword)) {
    $googleObject = new Google_Engine($limit, $keyword);
    $googleObject->searchGoogle();
    $links = $googleObject->googleLinks;
}




?>
<style type="text/css">
<!--
.style1 {
font-size: 24px;
color: #000099;
}
.style2 {
font-size: 16px;
font-weight: bold;
color: #666666;
}

.style3 {
font-size: 16px;
font-weight: bold;
color:#0000CC;
}
.style4 {
font-size: 16px;
font-weight:500;
color:#000000;
font-family:"Times New Roman", Times, serif;
}
-->
</style>
<?php
// Template for the G-Search page: search form followed by the result list.
// Normalise the inputs first so the markup below never touches an undefined
// variable, whatever the controller above did or did not set.
$formKeyword  = (isset($keyword) && is_string($keyword)) ? $keyword : '';
$resultLinks  = (isset($links) && is_array($links)) ? $links : array();
$wasSubmitted = !empty($_POST['Submit']);
?>
<table width="100%" border="0" cellspacing="0" cellpadding="0">
<tr>
<td align="center">&nbsp;</td>
</tr>
<tr>
<td height="24" align="center"><span class="style1">G-Search</span></td>
</tr>
<tr>
<td align="center"><form name="form1" method="post" action="">
<span class="style2">Search</span>&nbsp;
<?php /* The keyword is user input: escape it for the attribute context. */ ?>
<input name="keyword" type="text" id="keyword" size="50" value="<?php echo htmlspecialchars($formKeyword, ENT_QUOTES, 'UTF-8'); ?>">
<select name="limit" id="limit">
<option value="10">10</option>
<option value="20">20</option>
<option value="30">30</option>
<option value="40">40</option>
</select>
<input type="submit" name="Submit" value="Search">
</form>
</td>
</tr>
<tr>
<td align="center">&nbsp;</td>
</tr>
<tr>
<td align="center"><table width="90%" border="0" cellspacing="0" cellpadding="0">
<?php
foreach ($resultLinks as $link) {
    // Scraped values come from a remote page and are untrusted. They may
    // already contain HTML entities, so double_encode is disabled to avoid
    // turning "&amp;" into "&amp;amp;".
    $pageUrl   = htmlspecialchars(isset($link['pageUrl']) ? $link['pageUrl'] : '', ENT_QUOTES, 'UTF-8', false);
    $pageTitle = htmlspecialchars(isset($link['pageTitle']) ? $link['pageTitle'] : '', ENT_QUOTES, 'UTF-8', false);
    $pageInfo  = htmlspecialchars(isset($link['pageInfo']) ? $link['pageInfo'] : '', ENT_QUOTES, 'UTF-8', false);
?>
<tr>
<td align="left" valign="top" class="style4"><a href="<?php echo $pageUrl; ?>" class="style3" target="_blank"><?php echo $pageTitle; ?>
</a><br>
<?php echo $pageInfo; ?><br>
</td>
</tr>
<?php
}

// Show the "No results" row only after a submitted search with a keyword.
if (count($resultLinks) === 0 && $wasSubmitted && !empty($formKeyword)) {
?>
<tr>
<td align="left" valign="top">No results </td>
</tr>
<?php
}
?>
</table></td>
</tr>
<tr>
<td align="center">&nbsp;</td>
</tr>
</table>




For more details
SmartCoderIn => Scripts and tutorials : We give online assistance



SmartCoderIn => Scripts and tutorials : We give online assistance

?>
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #2 (permalink)  
Old 03-17-2008, 03:51 PM
bbalegere bbalegere is offline
Junior Member
 
Join Date: Mar 2008
Posts: 4
Default

What exactly is meant by scraping Google?
Is it reading content from a Google search page and displaying it on your own site?
This code may work, but people will definitely come to know that the site is using Google's engine.
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #3 (permalink)  
Old 03-18-2008, 11:04 AM
smartcoderin smartcoderin is offline
Junior Member
 
Join Date: Mar 2008
Posts: 16
Default Nice

Sure, but the data is not presented that way. The data is stored in our database. Also, we search most of the search engines for the same query, then display the results when a user makes a search, so the data will seem to come from our site.
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #4 (permalink)  
Old 05-24-2008, 08:21 PM
borsa_mat borsa_mat is offline
Member
 
Join Date: Apr 2008
Posts: 61
Default

I think it is possible to scrape Google, but I don't know how it is done.
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #5 (permalink)  
Old 06-09-2008, 06:39 PM
Nathan Malone Nathan Malone is offline
Junior Member
 
Join Date: Jun 2008
Posts: 4
Default

Another way which is more in line with the TOS is to use an API to access the data. Google has one, but they stopped giving out new API keys a while back. Yahoo, however, is still giving them out...
__________________
For my AdWords / PPC Management services, please visit PPCManagementBlog.com/Services. Thanks!
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #6 (permalink)  
Old 06-23-2008, 10:41 AM
sophiya sophiya is offline
Junior Member
 
Join Date: Jun 2008
Posts: 7
Default

how to scrap google scraping
__________________
seo services in India
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #7 (permalink)  
Old 06-25-2008, 09:03 AM
hannahross hannahross is offline
Junior Member
 
Join Date: Jun 2008
Posts: 13
Default google scrap

I read your post about scraping, but I can't follow it clearly. Can you explain it?
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #8 (permalink)  
Old 08-04-2008, 02:47 PM
wcsupport wcsupport is offline
Junior Member
 
Join Date: Aug 2008
Posts: 7
Default

Hi,

Just wondering why you would want to save Google content and present it as your own?

You can add a Google search to your website if you want to give people the opportunity to search.

Just curious
__________________
Steve
I Want To Go As Fancy Dress
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #9 (permalink)  
Old 09-26-2008, 08:40 PM
TheGamerX TheGamerX is offline
Senior Member
 
Join Date: Aug 2008
Posts: 107
Default

looks good, but won't google penalize for this?
__________________
My favorite kind of Online games are
MMORPG | MMO | Free MMORPG |Free MMO | Games like MapleStory and Warcraft are amazing . MMORPGs are fun because they're social. You interact with more people.
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
  #10 (permalink)  
Old 10-02-2008, 03:39 PM
emma3433 emma3433 is offline
Junior Member
 
Join Date: Oct 2008
Posts: 4
Default

yes i agree
Digg this Post!Add Post to del.icio.usBookmark Post in TechnoratiFurl this Post!
Reply With Quote
Reply


Thread Tools
Display Modes

Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

vB code is On
Smilies are On
[IMG] code is On
HTML code is Off
Trackbacks are On
Pingbacks are On
Refbacks are On



All times are GMT. The time now is 12:03 PM.


Sponsored Links

Powered by vBulletin® Version 3.6.7
Copyright ©2000 - 2008, Jelsoft Enterprises Ltd.
SEO by vBSEO 3.0.0
vB Ad Management by =RedTyger=