I threw this together one afternoon as I was frustrated that even though Tesco displays the 'price-per-ml' for things like cans and bottles of Coca Cola, it doesn't let you sort by 'price-per-ml'. So I wrote a quick script that will download an entire sub category of products, extract the data from them, then output them all to a CSV that I can filter/sort myself.
It was easier than I expected (I didn't have to do much scraping) as there is a data-element in the body tag of the page that has all the product detail in json format.
<?php // Download a Tesco category of products to a CSV, so I can sort by price/ml // I tried to do the download with curl/file_get_contents but the Tesco website // has a load of javascript, so I eventually ended up just using wget to deal with it $sub_category = "drinks/fizzy-and-soft-drinks"; $output_file = "fizzy-drinks"; $output_csv_file = $output_file.".csv"; $number_of_items_per_page = 48; // 48 is the maximum $max_pages_to_download = 20; // shouldn't ever be higher than this $user_agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)"; // Tesco/wget wants this $url = "https://www.tesco.com/groceries/en-GB/shop/$sub_category/all?sortBy=price-descending&count=$number_of_items_per_page"; echo "Downloading $url\n"; // do the first download (so we can calculate how many pages we need to get the rest) exec("wget -q --user-agent=\"$user_agent\" \"$url\" -O $output_file-1.html"); // load the first page we just downloaded from disk $first_page = file_get_contents($output_file."-1.html"); // find the 'Showing x to x of x items' element and use that to work out how many pages $doc= new DOMDocument(); @$doc->loadHTML($first_page); // @ to suppress a load of DOM errors on Tesco's badly formatted page $finder = new DomXPath($doc); $pages_class = "pagination__items-displayed"; $pages = $finder->query("//*[contains(@class, '$pages_class')]"); $page_text = $pages[0]->nodeValue; preg_match("/Showing \d+ to \d+ of (\d+)/", $page_text, $match); $total_items = $match[1]; $pages = ceil($total_items/$number_of_items_per_page); if($pages >= $max_pages_to_download) exit("Number of pages to download ($pages) exceed max threshold"); // download the rest of the pages to disk for($i=2;$i<=$pages;$i++) { $next_url = $url."&page=$i"; echo "Downloading $next_url\n"; exec("wget -q --user-agent=\"$user_agent\" \"$next_url\" -O $output_file-$i.html"); } $lines = array(); $lines[] = array('Product Name', 'Tesco URL', 'Brand', 'Price', 'Price per unit', 'Unit Measure'); // read through each of the pages, parse the json and output as csv echo "Parsing JSON to CSV\n"; for($i=1;$i<=$pages;$i++) { $json_id = $i - 1; $file_content = file_get_contents($output_file."-".$i.".html"); $doc= new DOMDocument(); @$doc->loadHTML($file_content); $body = $doc->getElementsByTagName('body')->item(0); // get all product date on the page in json format (it's a data-attribute of the body element) $items = $body->getAttribute('data-redux-state'); file_put_contents($output_file."-$i.json", $items); $json = json_decode(str_replace(""", "'", $items)); foreach($json->results->pages[$json_id]->serializedData as $val) { foreach($val as $product_id=>$item) { if(isset($item->product)) { $row = array(); $row[] = $item->product->title; $row[] = "https://www.tesco.com/groceries/en-GB/products/".$item->product->id; $row[] = $item->product->brandName; $row[] = $item->product->price; $row[] = $item->product->unitPrice; $row[] = $item->product->unitOfMeasure; $lines[] = $row; } } } } echo "Writing CSV\n"; // write all the data to the CSV $fp = fopen($output_csv_file, "w"); foreach($lines as $line) { fputcsv($fp, $line); }
php tesco-to-csv.php Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48 Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48&page=2 Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48&page=3 Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48&page=4 Downloading https://www.tesco.com/groceries/en-GB/shop/drinks/fizzy-and-soft-drinks/all?sortBy=price-descending&count=48&page=5 Parsing JSON to CSV Writing CSV