(949)446-1716 Give us a call Mon-Fri 9am-5pm

Using Web Bots to hunt for B2B marketing leads

How we got vendor email addresses

Let's use houzz.com as our target for this example (for educational purposes only). The goal is to obtain email addresses from the site's online vendor directory.

Problem is the email addresses ARE NOT available on HOUZZ's website. 

We'll walk you through how we overcame this problem and got what we needed. 

The Houzz BOT at work....Console reporting back results

analysis of site and strategy used

mainpage
Houzz vendor listings

targets indexed

So we use this page to index all the vendors aka our targets.

detail_page
Vendor Detail Page

Email Workaround

We then use our bot to visit each vendor's page and collect any relevant vendor details. Unfortunately, no email address is listed. But they do provide the vendor's website, so we'll send our bot there next.

website_email
Vendors Website

Obtain Payload

Our bot is now on the vendor's website, where we instruct it to scour the site's pages in search of an email address (aka the payload).

Here is the code, which you can also access in the Bitbucket repository.

The MySQL code is commented out, in case you'd rather store the retrieved data in a database. I opted to place the results in a CSV file.

The script writes two files. The first, links.txt, is written in step 1 and indexes the vendor page URLs.

In step 2 we read that file of target vendor URLs back in, search each vendor's website for email addresses, and write the results to the second file, archs.csv.

Feedback is outputted in the terminal using fwrite(STDOUT)


//You can get these files over at my https://bitbucket.org/nicknguyenzrd/houzzbot/
require("crawler.php");
require('CSSQuery.php');

error_reporting(E_ERROR | E_PARSE);
//@ini_set('display_errors', 0);



/* Uncomment below to store data in MYSQL
$servername = "localhost";
$username = "root";
$password = "";
$dbname = "invoice";
// Create connection
$conn = new mysqli($servername, $username, $password, $dbname);
// Check connection
if ($conn->connect_error) {
	die("Connection failed: " . $conn->connect_error);
}
*/

//Step 1 Gather Houzz Links
//
// Walks the paginated Houzz listing, keeps every vendor profile link found in an
// "a.pro-title" anchor, appends each link to links.txt and echoes it to the terminal.
// When this step finishes, $handle is an open read handle on links.txt for step 2.

// Record type written into every CSV row by step 2 (1 is the only value this script uses).
$type = 1;
// Running record number; step 2 keeps counting from wherever step 1 stops.
$id = 1;

//Data Placeholder Array
$data['href']    = array();
$data['company'] = array();
$data['type']    = array();
$data['id']      = array();

$listing_url = "http://www.houzz.com/professionals/landscape-architect/orange-county";
// Offset used in the "/p/{offset}" URLs; it advances by 15 per page
// (presumably the number of vendors per listing page - confirm against the site).
$p = 0;

// Open the link list once for appending instead of re-opening it for every link.
$links = fopen( 'links.txt', "a+" );
if ( $links === false ) {
	die( "Unable to open links.txt for writing\r\n" );
}

//Deal with multiple page results with The All Powerful Iterative Loop
for ( $i = 1; $i <= 30; $i++ ) {
	// The first page has no "/p/..." suffix; every later page is "/p/{offset}".
	$page_url = ( $i === 1 ) ? $listing_url : $listing_url . "/p/" . $p;
	$p        = $p + 15;

	$html = file_get_contents( $page_url );
	if ( $html === false || $html === '' ) {
		// A failed download must not abort the crawl (loadHTML() rejects empty input).
		fwrite( STDOUT, "Could not load listing page: " . $page_url . "\r\n" );
	} else {
		//Webpage loaded for us
		$doc = new DOMDocument();
		$doc->loadHTML( $html );
		$css = new CSSQuery( $doc );

		foreach ( $css->query( 'a.pro-title' ) as $a ) {
			$href_node = $a->attributes->getNamedItem( 'href' );
			//Get URL Link Filter out Javascript (and anchors without any href)
			if ( $href_node === null || $href_node->value === "javascript:;" ) {
				continue;
			}
			$href = $href_node->value;

			//Store link and company name
			$data['id'][]      = $id;
			$data['href'][]    = $href;
			$data['company'][] = $a->nodeValue;
			$data['type'][]    = 1;

			//Append to our List of Links and report progress on the terminal
			$somecontent = $href . "\r\n";
			fwrite( $links, $somecontent );
			fwrite( STDOUT, $somecontent );
			$id++;
		}
		unset( $doc, $css );
	}

	// Push this page's links to disk so an interrupted run keeps what it already found.
	fflush( $links );
	// Be polite: pause between listing requests.
	sleep( 1 );
	//var_dump( $data );
}
fclose( $links );

// Step 2 reads the collected links back through $handle, so hand it a fresh read handle.
$handle = fopen( "links.txt", "r" );

//Step 2 Gather company details. Houzz doesn't list email addresses, so we'll have to improvise and go to their website to acquire the target email contact, if it's listed on their website.

/**
 * Check whether a string has the overall shape of a URL before the crawler is pointed at it.
 *
 * The pattern accepts an optional http/https/ftp scheme, optional user[:password]@,
 * a dotted host name ending in an alphabetic TLD, and optional port, path, query
 * string and anchor. Matching is case-insensitive and the TLD may be 2-63 letters,
 * so addresses such as "HTTP://Example.COM" or "http://example.design" are accepted.
 *
 * @param string $url Candidate URL, e.g. the href of a vendor's website link.
 * @return int|false The raw preg_match() result: 1 when $url matches, 0 when it
 *                   does not, false if the regex engine reports an error.
 */
function get_valid_url( $url ) {
	$regex  = '((https?|ftp)\:\/\/)?'; // Scheme
	$regex .= '([a-z0-9+!*(),;?&=$_.-]+(\:[a-z0-9+!*(),;?&=$_.-]+)?@)?'; // User and Pass
	$regex .= '([a-z0-9-.]*)\.([a-z]{2,63})'; // Host name and TLD (63 is the DNS label limit)
	$regex .= '(\:[0-9]{2,5})?'; // Port
	$regex .= '(\/([a-z0-9+$_-]\.?)+)*\/?'; // Path
	$regex .= '(\?[a-z+&$_.-][a-z0-9;:@&%=+\/$_.-]*)?'; // GET Query
	$regex .= '(#[a-z_.-][a-z0-9+$_.-]*)?'; // Anchor
	// "i": scheme and host names are case-insensitive, so match them that way.
	return preg_match( '/^' . $regex . '$/i', $url );
}

// Step 2: visit every vendor profile listed in links.txt, collect the details shown
// on the profile, crawl the vendor's own website for an email address and append one
// record per vendor to archs.csv.

// The handle left behind by step 1 may already be closed (or may never have opened);
// is_resource() is false in both cases, so open the link list for reading here.
if ( ! is_resource( $handle ) ) {
	$handle = fopen( 'links.txt', 'r' );
}

if ($handle) {
	// Wrap a value in double quotes for the CSV row, doubling embedded quotes so a
	// company name such as: Joe "The Gardener" Smith cannot break the column layout.
	$quote = function ( $value ) {
		return '"' . str_replace( '"', '""', (string) $value ) . '"';
	};

	//Write architect contact information into a CSV file
	$wr = fopen( 'archs.csv', "a+" );
	if ( $wr === false ) {
		fclose( $handle );
		die( "Unable to open archs.csv for writing\r\n" );
	}

	while ( ( $line = fgets( $handle ) ) !== false ) {
		// Each stored line ends in "\r\n"; strip it before using the line as a URL.
		$link = trim( $line );
		if ( $link === '' ) {
			continue;
		}

		// Reset the per-vendor fields so nothing leaks over from the previous record.
		$email    = "";
		$website  = "";
		$url      = "";
		$name     = "";
		$company  = "";
		$phone    = "";
		$tier     = "";
		$location = "";
		$license  = "";

		$html = file_get_contents( $link );
		if ( $html === false || $html === '' ) {
			// A profile that cannot be downloaded is reported and skipped.
			fwrite( STDOUT, "Skipping (could not load): " . $link . "\r\n" );
			continue;
		}
		$doc = new DOMDocument();
		$doc->loadHTML( $html );
		$css = new CSSQuery( $doc );

		//Houzz Link to profile
		$data['link'] = $link;

		//Company Name
		$nrr = $css->query( 'a.profile-full-name' );
		if ( isset( $nrr[0] ) ) {
			$company = $nrr[0]->textContent;
		}
		$data['company'][] = $company;
		fwrite( STDOUT, "Starting: " . $id . ":" . $company . "\r\n" );

		//Website and Email Addresses
		foreach ( $css->query( 'a.proWebsiteLink' ) as $a ) {
			$href_node = $a->attributes->getNamedItem( 'href' );
			if ( $href_node === null ) {
				continue;
			}
			$url = $href_node->value;
			if ( get_valid_url( $url ) ) {
				$data['website'][] = $url;
				$website           = $url;
				fwrite( STDOUT, "Attempting site: " . $url . "\r\n" );
				$parse  = parse_url( $url );
				$foo    = new crawler( $url, $parse['host'], 2, true, true );
				$result = $foo->init();

				//Found Email Address
				if ( isset( $result['emails'][0] ) ) {
					$email = $result['emails'][0];
					//Output indicating email address discovered CLI
					fwrite( STDOUT, "Found Email Address: " . $email . "\r\n" );
				}
			}
		}

		//Phone Number (the "Website" entry shares the same markup, so skip it)
		foreach ( $css->query( 'span.pro-contact-text' ) as $c ) {
			if ( $c->nodeValue !== "Website" ) {
				$data['phone'][] = $c->nodeValue;
				$phone           = $c->nodeValue;
			}
		}

		//All company details: each entry is labelled text such as "Contact: ..."
		foreach ( $css->query( 'div.info-list-text' ) as $entry ) {
			$test = $entry->nodeValue;
			//Person to contact
			if ( strpos( $test, "Contact:" ) !== false ) {
				$name              = trim( str_replace( "Contact:", '', $test ) );
				$data['contact'][] = $name;
			}
			//Address
			if ( strpos( $test, "Location:" ) !== false ) {
				$location           = trim( str_replace( "Location:", '', $test ) );
				$data['location'][] = $location;
			}
			//License Number
			if ( strpos( $test, "License Number:" ) !== false ) {
				$license           = trim( str_replace( "License Number:", '', $test ) );
				$data['license'][] = $license;
			}
			//Tier
			if ( strpos( $test, "Typical Job Costs:" ) !== false ) {
				$tier           = trim( str_replace( "Typical Job Costs:", '', $test ) );
				$data['tier'][] = $tier;
			}
		}

		//Architect Record: id,type,"company","phone","website","email","link","contact","location","license","tier"
		$fields  = array( $company, $phone, $url, $email, $link, $name, $location, $license, $tier );
		$details = $id . "," . $type . "," . implode( ",", array_map( $quote, $fields ) ) . " \r\n";
		fwrite( $wr, $details );
		// Flush per record so an interrupted crawl keeps everything gathered so far.
		fflush( $wr );
		//Disable Comment Below to OutPut to CLI
		//fwrite(STDOUT, $details);

/*  Uncomment below if you'd rather insert the scraped data into a MySQL database
    (the connection block near the top of the script must be uncommented too).
    A prepared statement is used because scraped text routinely contains quotes.

		$stmt = $conn->prepare(
			"INSERT INTO ip_oppurtunities(`type`,`company`,`phone`,`website`,`email`,`link`,`contact`,`location`,`license`,`tier`)
VALUES (1,?,?,?,?,?,?,?,?,?)"
		);
		$stmt->bind_param( 'sssssssss', $company, $phone, $website, $email, $link, $name, $location, $license, $tier );
		if ( $stmt->execute() ) {
			fwrite(STDOUT, $id.'-'.$company." Added \r\n");
		} else {
			$error = $stmt->error;
			fwrite(STDOUT,  "Error: ".$company."=".$error."\r\n");
			die();
		}
		$stmt->close();
*/

		$id++;
		unset( $doc, $css );
	}
	fclose( $wr );
	fclose( $handle );
}

Leave a Comment

You must be logged in to post a comment.