Using Web Bots to hunt for B2B marketing leads
How we got vendor email addresses
Let's use houzz.com (for educational purposes) as our target for this example; the goal is to obtain email addresses from the site's online vendor directory.
The problem is that the email addresses ARE NOT available on Houzz's website.
We'll walk you through how we overcame this problem and got what we needed.
analysis of site and strategy used
Houzz vendor listings
targets indexed
So we use this page to index all the vendors, aka our targets.
Vendor Detail Page
Email Workaround
We then use our bot to visit each vendor's page and collect any relevant vendor details. Unfortunately, no email address is listed, but they do provide the vendor's website, so we'll send our bot there next.
Vendors Website
Obtain Payload
Our bot is now on the vendor's website; we instruct it to scour the site's pages in search of an email address (aka the payload).
Here is the code, which you can also access in the Bitbucket repository.
The MySQL code is commented out, in case you'd rather store the retrieved data in a database. I opted to place the results in a CSV file.
The script writes 2 files: in step 1 it writes links.txt, an index of the vendor page URLs, and in step 2 it writes archs.csv, a CSV of the vendor details.
In step 2 we read back the file of target vendor URLs and search each vendor's website for email addresses.
Feedback is outputted in the terminal using fwrite(STDOUT)
//You can get these files over at my Bitbucket repository: https://bitbucket.org/nicknguyenzrd/houzzbot/
// Third-party helpers: the script below instantiates `crawler` and `CSSQuery`,
// which are presumably defined in these two files (neither is shown here).
require("crawler.php");
require('CSSQuery.php');
// Report only fatal run-time errors and parse errors; warnings and notices
// (e.g. those raised while parsing real-world HTML) are not reported.
error_reporting(E_ERROR | E_PARSE);
//@ini_set('display_errors', 0);
/* Uncomment below to store data in MYSQL
$servername = "localhost";
$username = "root";
$password = "";
$dbname = "invoice";
// Create connection
$conn = new mysqli($servername, $username, $password, $dbname);
// Check connection
if ($conn->connect_error) {
die("Connection failed: " . $conn->connect_error);
}
*/
// Step 1: gather Houzz profile links.
// Walk the paginated vendor directory, collect every vendor profile URL and
// company name into $data, and append each URL to links.txt (one per line).
// On exit, $handle is a read handle on links.txt for step 2, $id is one past the
// last indexed vendor, and $type is the record type written into each CSV row.
$type = 1;

// Data placeholder arrays.
$data['href'] = array();
$data['company'] = array();
$data['type'] = array();
$data['id'] = array();
$id = 1;

// The first page lives at the bare directory URL; every later page is
// "/p/{offset}", where the offset grows by 15 per page.
$baseUrl = "http://www.houzz.com/professionals/landscape-architect/orange-county";
$p = 0;

// Open the link list once for appending (creates it when missing) instead of
// reopening it for every single link.
$linksFile = fopen('links.txt', "a+");
if ($linksFile === false) {
    die("Unable to open links.txt for writing\r\n");
}

for ($i = 1; $i <= 30; $i++) {
    $pageUrl = ($i === 1) ? $baseUrl : $baseUrl . "/p/" . $p;

    $html = file_get_contents($pageUrl);
    if ($html === false || $html === "") {
        // A failed fetch must not abort the whole run; skip this page only.
        fwrite(STDOUT, "Skipping page (could not load): " . $pageUrl . "\r\n");
    } else {
        $doc = new DOMDocument();
        $doc->loadHTML($html);
        $css = new CSSQuery($doc);

        foreach ($css->query('a.pro-title') as $a) {
            $hrefAttr = $a->attributes->getNamedItem('href');
            // Ignore anchors without a target and JavaScript placeholder links.
            if ($hrefAttr === null || $hrefAttr->value === "javascript:;") {
                continue;
            }

            // Store link and company name.
            $data['id'][] = $id;
            $data['href'][] = $hrefAttr->value;
            $data['company'][] = $a->nodeValue;
            $data['type'][] = 1;

            // Persist the link and echo it to the terminal as progress feedback.
            $somecontent = $hrefAttr->value . "\r\n";
            fwrite($linksFile, $somecontent);
            fwrite(STDOUT, $somecontent);
            $id++;
        }

        unset($doc);
        unset($css);
    }

    $p = $p + 15;
    // Be polite to the remote server between page requests.
    sleep(1);
    //var_dump( $data );
}

fclose($linksFile);

// Hand step 2 a fresh read handle positioned at the start of the link list.
$handle = fopen("links.txt", "r");
//Step 2 Gather company details. Houzz doesn't list email addresses, so we'll have to improvise and go to the vendor's own website to acquire the target email contact, if it's listed there.
//Make sure we double-check that we're dealing with valid URLs, because a bad one can badly derail things once the crawler is fired up!
/**
 * Check whether a string looks like a crawlable URL.
 *
 * The scheme (http, https or ftp) is optional. The host must end in an
 * alphabetic top-level domain of 2 to 63 letters, so bare IP addresses do not
 * match. Matching is case-insensitive because scheme and host are
 * case-insensitive and scraped hrefs are often mixed-case.
 *
 * @param string $url Candidate URL.
 * @return int|false 1 if $url matches, 0 if it does not, false if the regex
 *                   engine reports an error (the raw preg_match() result).
 */
function get_valid_url( $url ) {
    $regex = "((https?|ftp)\:\/\/)?"; // Scheme
    $regex .= "([a-z0-9+!*(),;?&=\$_.-]+(\:[a-z0-9+!*(),;?&=\$_.-]+)?@)?"; // User and Pass
    // Host name plus TLD; 63 is the maximum length of a DNS label, which admits
    // modern TLDs such as .info or .design that a 2-3 letter limit rejected.
    $regex .= "([a-z0-9-.]*)\.([a-z]{2,63})";
    $regex .= "(\:[0-9]{2,5})?"; // Port
    $regex .= "(\/([a-z0-9+\$_-]\.?)+)*\/?"; // Path
    $regex .= "(\?[a-z+&\$_.-][a-z0-9;:@&%=+\/\$_.-]*)?"; // GET Query
    $regex .= "(#[a-z_.-][a-z0-9+\$_.-]*)?"; // Anchor
    return preg_match("/^$regex$/i", $url);
}
// Step 2: for every Houzz profile URL in links.txt, scrape the vendor details,
// crawl the vendor's own website for an email address, and append one row per
// vendor to archs.csv. Progress is reported on STDOUT.

// If the handle left behind by step 1 is missing or already closed, reopen the
// link list so the loop below has something to read.
if (!isset($handle) || !is_resource($handle)) {
    $handle = fopen("links.txt", "r");
}

if ($handle) {
    while ( ( $line = fgets( $handle ) ) !== false ) {
        // Strip the trailing line break BEFORE using the line as a URL.
        $link = trim($line);
        if ($link === "") {
            continue;
        }

        // Per-vendor fields, reset on every iteration.
        $email = "";
        $website = "";
        $url = "";
        $name = "";
        $company = "";
        $phone = "";
        $tier = "";
        $location = "";
        $license = "";

        $html = file_get_contents($link);
        if ($html === false || $html === "") {
            // One unreachable profile must not abort the whole run.
            fwrite(STDOUT, "Skipping (could not load): " . $link . "\r\n");
            continue;
        }

        $doc = new DOMDocument();
        $doc->loadHTML($html);
        $css = new CSSQuery($doc);

        //Houzz Link to profile
        $data['link'] = $line;

        //Company Name
        $nrr = $css->query('a.profile-full-name');
        if (isset($nrr[0])) {
            $company = $nrr[0]->textContent;
        }
        $data['company'][] = $company;
        // NOTE(review): $id continues from wherever step 1 left it — confirm
        // that the CSV ids are not meant to restart at 1.
        fwrite(STDOUT, "Starting: " . $id . ":" . $company . "\r\n");

        //Website and Email Addresses
        $arr = $css->query('a.proWebsiteLink');
        foreach ($arr as $a) {
            $hrefAttr = $a->attributes->getNamedItem('href');
            if ($hrefAttr === null) {
                continue;
            }
            $url = $hrefAttr->value;
            if (get_valid_url($url)) {
                $data['website'][] = $url;
                $website = $url;
                fwrite(STDOUT, "Attempting site: " . $url . "\r\n");
                $parse = parse_url($url);
                // Scheme-less URLs pass validation but yield no 'host' key.
                $host = (is_array($parse) && isset($parse['host'])) ? $parse['host'] : null;
                $foo = new crawler($url, $host, 2, true, true);
                $result = $foo->init();
                //Found Email Address
                if (isset($result['emails'][0])) {
                    $email = $result['emails'][0];
                    //Output indicating email address discovered CLI
                    fwrite(STDOUT, "Found Email Address: " . $result['emails'][0] . "\r\n");
                }
            }
        }

        //Phone Number
        $crr = $css->query('span.pro-contact-text');
        foreach ($crr as $c) {
            // The same element class also labels the "Website" link; skip it.
            if ($c->nodeValue !== "Website") {
                $data['phone'][] = $c->nodeValue;
                $phone = $c->nodeValue;
            }
        }

        //All company details: each entry is a "Label: value" text block.
        $info = $css->query('div.info-list-text');
        foreach ($info as $entry) {
            $test = $entry->nodeValue;
            //Person to contact
            if (strpos($test, "Contact:") !== FALSE) {
                $name = trim(str_replace("Contact:", '', $test));
                $data['contact'][] = $name;
            }
            //Address
            if (strpos($test, "Location:") !== FALSE) {
                $location = trim(str_replace("Location:", '', $test));
                $data['location'][] = $location;
            }
            //License Number
            if (strpos($test, "License Number:") !== FALSE) {
                $license = trim(str_replace("License Number:", '', $test));
                $data['license'][] = $license;
            }
            //Tier
            if (strpos($test, "Typical Job Costs:") !== FALSE) {
                $tier = trim(str_replace("Typical Job Costs:", '', $test));
                $data['tier'][] = $tier;
            }
        }

        //Write architect contact information into a CSV file
        $wr = fopen('archs.csv', "a+");
        if ($wr === false) {
            die("Unable to open archs.csv for writing\r\n");
        }

        //Architect Record: id,type followed by the quoted text columns. The
        //contact column uses $name (the scraped "Contact:" value). Embedded
        //double quotes are doubled so they cannot break the row.
        $columns = array($company, $phone, $url, $email, $link, $name, $location, $license, $tier);
        $quoted = array();
        foreach ($columns as $column) {
            $quoted[] = "\"" . str_replace("\"", "\"\"", $column) . "\"";
        }
        $details = $id . "," . $type . "," . implode(",", $quoted) . " \r\n";
        fwrite($wr, $details);
        //Uncomment the line below to output to CLI
        //fwrite(STDOUT, $details);
        $id++;
        fclose($wr);

        /* Uncomment below if you'd rather insert scraped data into the MySQL database.
           A prepared statement is used so quotes in scraped values cannot break the query,
           and the connection is NOT closed here because the loop still needs it.
        $stmt = $conn->prepare("INSERT INTO ip_oppurtunities(`type`,`company`,`phone`,`website`,`email`,`link`,`contact`,`location`,`license`,`tier`)
        VALUES (1,?,?,?,?,?,?,?,?,?)");
        $stmt->bind_param('sssssssss', $company, $phone, $website, $email, $link, $name, $location, $license, $tier);
        if ($stmt->execute()) {
        fwrite(STDOUT, $id.'-'.$company." Added \r\n");
        } else {
        fwrite(STDOUT, "Error: ".$company." ".$stmt->error."\r\n");
        die();
        }
        $stmt->close();
        */
    }
    fclose($handle);
}