""" Carwow reader. Reads the logged in Carwow dealer stock list at dealers.carwow.co.uk and turns each card into a Car. Carwow shows more on the card than Motorway: registration, make, model and derivative, year, mileage, fuel, transmission, service history, reserve, CAP value, condition grade and distance, plus the listing state. Auction only. Steven never buys a buy it now listing, so any card whose state is not an auction state is discarded here, on both platforms. Nothing here bids. It reads the screen only. """ import re import json import html as htmllib from ..pricing import Car from .. import browser PLATE = re.compile(r"\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b") YEAR = re.compile(r"^(19[89]\d|20[0-4]\d)$") DISTANCE = re.compile(r"^(\d+)\s*miles? away$", re.I) MONEY = re.compile(r"£\s*([\d,]+)") PURE_NUM = re.compile(r"^\d{1,3}(?:,\d{3})+$|^\d{3,6}$") FUELS = {"petrol", "diesel", "hybrid", "electric", "petrol hybrid", "plug-in hybrid", "phev"} def _num(s): if s is None: return None s = str(s).replace(",", "").strip() return int(s) if s.isdigit() else None def _is_auction(state): # Auction listings carry a state mentioning auction, for example # waiting_for_auction or in_auction. Buy it now states do not. return "auction" in (state or "").lower() def _service_history(lines_lower): text = " ".join(lines_lower) if "no sh" in text or "no service history" in text: return "none" if "full" in text and "sh" in text or "fsh" in text: return "full" if "part" in text and ("sh" in text or "service" in text): return "partial" return "" def _card_blocks(page_html): blocks = re.split(r'(?=data-listing-id=")', page_html) return [b for b in blocks if b.startswith('data-listing-id=')] def parse_listing(page_html): cars = [] for block in _card_blocks(page_html): mid = re.match(r'data-listing-id="(\d+)"', block) listing_id = mid.group(1) if mid else "" state = re.search(r'data-listing-state="([^"]+)"', block) state = state.group(1) if state else "" # Auction only. if not _is_auction(state): continue # Photo: first image in the lazy photos JSON. photo = "" mph = re.search(r'lazy-photos-value="([^"]+)"', block) if mph: try: arr = json.loads(htmllib.unescape(mph.group(1))) if arr and isinstance(arr, list): photo = arr[0].get("url", "") except Exception: photo = "" # Drop svg and script noise, then read the visible lines. b = re.sub(r"]*>.*?", " ", block, flags=re.S) b = re.sub(r"]*>.*?", " ", b, flags=re.S) lines = [l.strip() for l in re.sub(r"<[^>]+>", "\n", htmllib.unescape(b)).split("\n") if l.strip()] lines_lower = [l.lower() for l in lines] reg = "" for l in lines: m = PLATE.search(l) if m: reg = m.group(0).replace(" ", "") break distance = None for l in lines: m = DISTANCE.match(l) if m: distance = _num(m.group(1)) break grade = None for i, l in enumerate(lines): if l.lower() == "grade" and i + 1 < len(lines): grade = _num(lines[i + 1]) break reserve = None for i, l in enumerate(lines): if "reserve price" in l.lower(): for j in range(i, min(i + 3, len(lines))): mm = MONEY.search(lines[j]) if mm: reserve = _num(mm.group(1)) break break cap = None for i, l in enumerate(lines): low = l.lower() if "cap" in low: if "no cap" in low: cap = None else: mm = MONEY.search(l) or (MONEY.search(lines[i + 1]) if i + 1 < len(lines) else None) if mm: cap = _num(mm.group(1)) break fuel = "" for l in lines: if l.lower() in FUELS: fuel = l break transmission = "" for l in lines: low = l.strip().lower() if low in ("manual", "automatic", "auto", "cvt", "semi-automatic", "semi automatic"): transmission = "Automatic" if low == "auto" else l.strip() break # Year, then the lines just before it give make, model and derivative. year = None year_idx = None for i, l in enumerate(lines): if YEAR.match(l): year = _num(l) year_idx = i break make = model = derivative = "" if year_idx is not None and year_idx >= 2: make_model = lines[year_idx - 2] derivative = lines[year_idx - 1] toks = make_model.split() if toks: make = toks[0] model = " ".join(toks[1:]) # Mileage: a pure number line that is not the year and looks like miles. mileage = None for i, l in enumerate(lines): if PURE_NUM.match(l) and _num(l) != year and (i != year_idx): v = _num(l) if v and v >= 100: mileage = v break sh = _service_history(lines_lower) car = Car( reg=reg, make=make, model=model, derivative=derivative, year=year, mileage=mileage, grade=grade, reserve=reserve, cap_clean=cap, distance_miles=distance, service_history=sh, engine=(derivative + (" " + fuel if fuel else "")).strip(), fuel=fuel, transmission=transmission, photo_url=photo, listing_url=browser.SITES["carwow"]["base_url"] + "/dealers/listings/" + listing_id, source="Carwow", ) car._listing_id = listing_id car._state = state car._fuel = fuel cars.append(car) return cars def enrich_from_detail(page, car): """Open a Carwow car's own page and read what the list card does not show: previous owners (Carwow calls them former keepers) and VAT qualifying. Reads the rendered page, leaving fields unset if they cannot be read so the brain can hold the car back rather than guess.""" if not car.listing_url: return car page.goto(car.listing_url, wait_until="domcontentloaded", timeout=45000) try: page.wait_for_load_state("networkidle", timeout=10000) except Exception: pass lines = [l.strip() for l in page.inner_text("body").split("\n") if l.strip()] def value_after(label): for i, l in enumerate(lines): if l.lower() == label.lower() and i + 1 < len(lines): return lines[i + 1].strip() return None owners = value_after("Former keepers") if owners and owners.isdigit(): car.owners = int(owners) vat = value_after("VAT qualifying") if vat is not None: car.vat_qualifying = vat.strip().lower() in ("yes", "y", "true") return car def read_live(playwright, limit=None, max_pages=25): """Read the whole filtered Carwow stock, page by page (Carwow paginates with &page=N). Accumulates auction cars across pages, stopping when a page shows no new listings. Auction only: non auction states (for example second chance quotes) are filtered out by parse_listing, which can legitimately leave zero cars outside the auction window.""" base = browser.SITES["carwow"]["stock_url"] sep = "&" if "?" in base else "?" cars = [] seen_cars = set() seen_raw = set() ctx = browser.open_reader_context(playwright, "carwow", headless=True) try: page = ctx.pages[0] if ctx.pages else ctx.new_page() for n in range(1, max_pages + 1): page.goto(f"{base}{sep}page={n}", wait_until="domcontentloaded", timeout=45000) try: page.wait_for_selector('[data-listing-id]', timeout=20000 if n == 1 else 8000) except Exception: break # no listings on this page, end of list html = page.content() raw_ids = set(re.findall(r'data-listing-id="(\d+)"', html)) new_raw = raw_ids - seen_raw if not new_raw: break # the site has run out of pages and is repeating seen_raw |= raw_ids for c in parse_listing(html): lid = getattr(c, "_listing_id", "") if lid and lid not in seen_cars: seen_cars.add(lid) cars.append(c) if limit and len(cars) >= limit: break finally: ctx.close() return cars[:limit] if limit else cars