loadHTML($first_page); // @ to suppress a load of DOM errors on Tesco's badly formatted page

// ---------------------------------------------------------------------------
// Step 1: read the "Showing X to Y of Z" pagination text on the first page to
// work out how many result pages have to be fetched.
// ---------------------------------------------------------------------------
$finder = new DomXPath($doc);
$pages_class = "pagination__items-displayed";
$page_nodes = $finder->query("//*[contains(@class, '$pages_class')]");
$page_node = ($page_nodes === false) ? null : $page_nodes->item(0);
if ($page_node === null) {
    // Without this element the item total is unknown; stop instead of
    // silently producing a CSV that only contains the header row.
    exit("Could not find the pagination element ($pages_class) on the first page");
}

$page_text = $page_node->nodeValue;
if (preg_match("/Showing \d+ to \d+ of (\d+)/", $page_text, $match) !== 1) {
    exit("Could not read the total number of items from the pagination text: $page_text");
}
$total_items = $match[1];
$pages = ceil($total_items / $number_of_items_per_page);

// Safety valve so a bad parse cannot trigger a huge number of downloads.
if ($pages >= $max_pages_to_download) {
    exit("Number of pages to download ($pages) exceed max threshold");
}

// ---------------------------------------------------------------------------
// Step 2: download the rest of the pages to disk. This loop starts at page 2;
// the parsing loop below also reads "$output_file-1.html", which is not
// written anywhere in this section.
// ---------------------------------------------------------------------------
for ($i = 2; $i <= $pages; $i++) {
    $next_url = $url."&page=$i";
    echo "Downloading $next_url\n";

    // Every argument is shell-escaped so that spaces, quotes, '&' or '$' in
    // the user agent, URL or output path cannot break or hijack the command.
    $command = "wget -q --user-agent=" . escapeshellarg($user_agent)
        . " " . escapeshellarg($next_url)
        . " -O " . escapeshellarg("$output_file-$i.html");
    exec($command);
}

// First CSV row is the header.
$lines = [];
$lines[] = ['Product Name', 'Tesco URL', 'Brand', 'Price', 'Price per unit', 'Unit Measure'];

// ---------------------------------------------------------------------------
// Step 3: read through each of the pages, parse the JSON and collect CSV rows.
// ---------------------------------------------------------------------------
echo "Parsing JSON to CSV\n";
for ($i = 1; $i <= $pages; $i++) {
    // Page $i of the site is stored at index $i - 1 of results->pages.
    $json_id = $i - 1;
    $page_file = $output_file."-".$i.".html";

    $file_content = file_get_contents($page_file);
    if ($file_content === false || $file_content === '') {
        // A failed download leaves a missing or empty file; loadHTML() would
        // reject the empty string, so skip this page and carry on.
        echo "Skipping page $i: could not read $page_file\n";
        continue;
    }

    $doc = new DOMDocument();
    @$doc->loadHTML($file_content);
    $body = $doc->getElementsByTagName('body')->item(0);
    if ($body === null) {
        echo "Skipping page $i: no <body> element found in $page_file\n";
        continue;
    }

    // get all product data on the page in JSON format (it's a data-attribute of the body element)
    $items = $body->getAttribute('data-redux-state');
    file_put_contents($output_file."-$i.json", $items);

    // Any literal "&quot;" (the HTML entity for a double quote) that survived
    // attribute decoding is turned into a single quote, which is harmless
    // inside a JSON string, before decoding.
    $json = json_decode(str_replace("&quot;", "'", $items));
    if (!isset($json->results->pages[$json_id]->serializedData)) {
        echo "Skipping page $i: product data missing or not valid JSON\n";
        continue;
    }

    foreach ($json->results->pages[$json_id]->serializedData as $val) {
        foreach ($val as $item) {
            // Only entries that carry a "product" object become CSV rows.
            if (!isset($item->product)) {
                continue;
            }

            $product = $item->product;
            $lines[] = [
                $product->title,
                "https://www.tesco.com/groceries/en-GB/products/".$product->id,
                $product->brandName,
                $product->price,
                $product->unitPrice,
                $product->unitOfMeasure,
            ];
        }
    }
}

// ---------------------------------------------------------------------------
// Step 4: write all the data to the CSV.
// NOTE(review): $fp is not closed in this section; confirm that the code
// following it calls fclose($fp).
// ---------------------------------------------------------------------------
echo "Writing CSV\n";
$fp = fopen($output_csv_file, "w");
if ($fp === false) {
    exit("Could not open $output_csv_file for writing");
}
foreach ($lines as $line) {
    fputcsv($fp, $line);
}