====== Tesco Scraper ======
I threw this together one afternoon as I was frustrated that even though Tesco displays the 'price-per-ml' for things like cans and bottles of Coca Cola, it doesn't let you sort by 'price-per-ml'. So I wrote a quick script that will download an entire sub category of products, extract the data from them, then output them all to a CSV that I can filter/sort myself.
It was easier than I expected (I didn't have to do much scraping) as there is a data-element in the body tag of the page that has all the product detail in json format.
loadHTML($first_page); // @ to suppress a load of DOM errors on Tesco's badly formatted page
$finder = new DomXPath($doc);
$pages_class = "pagination__items-displayed";
$pages = $finder->query("//*[contains(@class, '$pages_class')]");
$page_text = $pages[0]->nodeValue;
preg_match("/Showing \d+ to \d+ of (\d+)/", $page_text, $match);
$total_items = $match[1];
$pages = ceil($total_items/$number_of_items_per_page);
if($pages >= $max_pages_to_download)
exit("Number of pages to download ($pages) exceed max threshold");
// download the rest of the pages to disk
for($i=2;$i<=$pages;$i++) {
$next_url = $url."&page=$i";
echo "Downloading $next_url\n";
exec("wget -q --user-agent=\"$user_agent\" \"$next_url\" -O $output_file-$i.html");
}
$lines = array();
$lines[] = array('Product Name', 'Tesco URL', 'Brand', 'Price', 'Price per unit', 'Unit Measure');
// read through each of the pages, parse the json and output as csv
echo "Parsing JSON to CSV\n";
for($i=1;$i<=$pages;$i++) {
$json_id = $i - 1;
$file_content = file_get_contents($output_file."-".$i.".html");
$doc= new DOMDocument();
@$doc->loadHTML($file_content);
$body = $doc->getElementsByTagName('body')->item(0);
// get all product date on the page in json format (it's a data-attribute of the body element)
$items = $body->getAttribute('data-redux-state');
file_put_contents($output_file."-$i.json", $items);
$json = json_decode(str_replace(""", "'", $items));
foreach($json->results->pages[$json_id]->serializedData as $val) {
foreach($val as $product_id=>$item) {
if(isset($item->product)) {
$row = array();
$row[] = $item->product->title;
$row[] = "https://www.tesco.com/groceries/en-GB/products/".$item->product->id;
$row[] = $item->product->brandName;
$row[] = $item->product->price;
$row[] = $item->product->unitPrice;
$row[] = $item->product->unitOfMeasure;
$lines[] = $row;
}
}
}
}
echo "Writing CSV\n";
// write all the data to the CSV
$fp = fopen($output_csv_file, "w");
foreach($lines as $line) {
fputcsv($fp, $line);
}
php tesco-to-csv.php
Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48
Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48&page=2
Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48&page=3
Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48&page=4
Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48&page=5
Parsing JSON to CSV
Writing CSV
{{:pasted:20220502-153759.png}}