import requests
import json
from bs4 import BeautifulSoup as soup


def getProductsByLink(url):
    # Fetch one listing page and collect the absolute link of every product on it.
    httpRequest = requests.get(url)
    parsedHTML = soup(httpRequest.text, "html.parser")
    products = parsedHTML.find("div", class_="products")
    if products is None:
        print("Error: " + url)
        return []
    productLinks = []
    for element in products.find_all("div", class_="prd"):
        a = element.find("a", href=True)
        productLinks.append("https://www.penti.com" + a['href'])
    return productLinks


url = 'https://www.penti.com/tr/c/boxer'
httpRequest = requests.get(url)
parsedHTML = soup(httpRequest.text, "html.parser")

# The "plp-info" text begins with the total product count; the listing
# shows 42 products per page, so the page count follows from it.
productCount = int(parsedHTML.find("div", class_="plp-info").text.split()[0])
pageCount = productCount // 42

productLinks = []
for page in range(pageCount + 1):
    productLinks.extend(getProductsByLink(f'{url}?page={page}'))


def getProductHTML(url):
    # Fetch a product page and return its parsed DOM.
    response = requests.get(url)
    return soup(response.text, "html.parser")


def getProductCodesAndSizes(parsedHTML, url):
    # The product code is the last URL segment with dashes stripped; each
    # size option's data-code is that code plus a size suffix, so removing
    # the code leaves just the size.
    productCode = url.split('/')[-1].replace("-", "")
    sizeDropdown = parsedHTML.find("div", class_="dropdown size-dropdown")
    sizeDropdownItems = sizeDropdown.find_all("a", class_="dropdown-item")
    sizes = [str(item['data-code']).replace(productCode, "") for item in sizeDropdownItems]
    return productCode, sizes


def getStoresAndSizes(productCode, sizes):
    # Query the store finder for every size and keep only stores located in
    # the whitelisted Istanbul districts; sizes with no matching store are dropped.
    whitelist = [
        "beylikdüzü",
        "avcılar",
        "esenyurt",
        "büyükçekmece",
        "bahçelievler"
    ]
    data = {}
    for size in sizes:
        payload = {
            'productCode': productCode + size,
            'cityCode': '34',   # Istanbul
            'townCode': '0',    # all districts
            'pageType': 'PRODUCT'
        }
        response = requests.post("https://www.penti.com/tr/store-finder/find-store", data=payload)
        if response.status_code == 200:
            parsedHTML = soup(response.text, "html.parser")
            data[size] = {"status": "success", "stores": []}
            for store in parsedHTML.find_all("div", class_="sli"):
                info = store.find("div", class_="sli-info-left")
                if info is not None:
                    name = info.find("div", class_="sli-title").get_text(strip=True)
                    location = info.find("div", class_="sli-address").get_text(strip=True)
                    if any(district in location.lower() for district in whitelist):
                        data[size]["stores"].append({
                            "name": name,
                            "location": location
                        })
            if not data[size]["stores"]:
                data[size]["status"] = "failed"
        else:
            data[size] = {"status": "failed"}
    # Drop sizes that are out of stock everywhere (or whose lookup failed).
    for size in [s for s, info in data.items() if info.get("status") == "failed"]:
        del data[size]
    return data, list(data.keys())


def getImageLinks(parsedHTML):
    # Collect image URLs from the product gallery, preferring src and
    # falling back to (data-)srcset for lazily loaded images.
    gallery = parsedHTML.find('div', class_='pdp-gallery')
    image_links = []
    for img in gallery.find_all('img'):
        if img.get('src'):
            image_links.append(img['src'])
        elif img.get('srcset'):
            image_links.append(img['srcset'])
        elif img.get('data-srcset'):
            image_links.append(img['data-srcset'])
    return image_links


def getTitle(parsedHTML):
    title = parsedHTML.find('h1', class_='pdp-title')
    return title.text.strip()


def getProductInfo(url):
    # Scrape a single product page into one record.
    parsedHTML = getProductHTML(url)
    productCode, sizes = getProductCodesAndSizes(parsedHTML, url)
    stores, sizes = getStoresAndSizes(productCode, sizes)
    return {
        "title": getTitle(parsedHTML),
        "url": url,
        "imageLinks": getImageLinks(parsedHTML),
        "stores": stores,
    }


# Write the records as a JSON array incrementally, one product per request,
# so partial results survive if the scrape is interrupted.
with open('data.json', 'w') as file:
    file.write("[\n")
    for index, url in enumerate(productLinks):
        json.dump(getProductInfo(url), file, indent=4)
        if index < len(productLinks) - 1:
            file.write(",")
        file.write("\n")
        print(f"{(index + 1) * 100 // len(productLinks)}% -> {url}")
    file.write("]\n")
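

# A minimal sketch of consuming the output, assuming the scrape above ran to
# completion so data.json holds the finished array. The field names match the
# records written by getProductInfo(); each record maps sizes to the
# whitelisted stores that stock them.
#
#     import json
#
#     with open('data.json') as f:
#         products = json.load(f)
#
#     for product in products:
#         in_stock_sizes = list(product["stores"].keys())
#         print(product["title"], "->", in_stock_sizes)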