import requests
from lxml import html
from bs4 import BeautifulSoup as soup
from urllib.parse import urlparse
import json
def getProductsByLink(url):
    """Fetch one category listing page and return absolute product URLs.

    Returns an empty list when the page has no ``div.products`` container
    (e.g. a page index past the last real page).
    """
    response = requests.get(url)
    # NOTE: renamed from `html` — the original shadowed the `lxml.html` import.
    parsedHTML = soup(response.text, "html.parser")
    products = parsedHTML.find("div", class_="products")
    if products is None:
        print("Error: " + url)
        return []
    productLinks = []
    for element in products.find_all("div", class_="prd"):
        anchor = element.find("a", href=True)
        # Fix: a product card without an <a href> crashed the original
        # (`a['href']` on None); skip such cards instead.
        if anchor is not None:
            productLinks.append("https://www.penti.com" + anchor['href'])
    return productLinks
# --- Collect product links from every page of the category listing ---
url = 'https://www.penti.com/tr/c/boxer'
httpRequest = requests.get(url)
# NOTE: the original bound the response text to `html`, shadowing the
# `lxml.html` import; parse the text directly instead.
parsedHTML = soup(httpRequest.text, "html.parser")
# The "plp-info" banner starts with the total product count, e.g. "123 ürün".
productCount = int(parsedHTML.find("div", class_="plp-info").text.split()[0])
# 42 products per listing page; floor division (not int(a / b)) plus one
# extra page covers any remainder.
pageCount = productCount // 42
productLinks = []
for page in range(pageCount + 1):
    # getProductsByLink always returns a list (possibly empty), never None,
    # so the original `is not None` guard was dead code.
    productLinks.extend(getProductsByLink(f'{url}?page={page}'))
def getProductHTML(url):
    """Download *url* and return its body parsed as a BeautifulSoup tree."""
    return soup(requests.get(url).text, "html.parser")
def getProductCodesAndSizes(parsedHTML, url):
    """Extract the product code and the available size suffixes.

    The product code is the last URL path segment with dashes removed.
    Each size dropdown item's ``data-code`` is ``<productCode><size>``, so
    stripping the code from it leaves the bare size suffix.

    Returns a ``(productCode, sizes)`` tuple.
    """
    # Hoisted: the original recomputed this expression for every dropdown
    # item and once more for the return value.
    productCode = url.split('/')[-1].replace("-", "")
    sizeDropdown = parsedHTML.find("div", class_="dropdown size-dropdown")
    sizeDropdownItems = sizeDropdown.find_all("a", class_="dropdown-item")
    sizes = [str(item['data-code']).replace(productCode, "") for item in sizeDropdownItems]
    return productCode, sizes
def getStoresAndSizes(productCode, sizes):
    """Query the store finder for each size and keep only in-stock sizes.

    For every size, POSTs to the penti.com store finder (city 34 =
    Istanbul) and filters the returned stores to a whitelist of districts.
    Sizes with no whitelisted store (or a failed request) are dropped.

    Returns ``(data, sizes)`` where ``data`` maps each in-stock size to
    ``{"status": "success", "stores": [{"name", "location"}, ...]}`` and
    ``sizes`` is the list of those in-stock sizes.
    """
    # Hoisted: the original rebuilt this list for every store of every size.
    whitelist = [
        "beylikdüzü",
        "avcılar",
        "esenyurt",
        "büyükçekmece",
        "bahçelievler",
    ]
    url = "https://www.penti.com/tr/store-finder/find-store"
    data = {}
    for size in sizes:
        payload = {
            'productCode': productCode + size,
            'cityCode': '34',
            'townCode': '0',
            'pageType': 'PRODUCT'
        }
        response = requests.post(url, data=payload)
        if response.status_code != 200:
            # Failed request -> size is dropped, matching the original's
            # build-then-delete of "failed" entries.
            continue
        parsedHTML = soup(response.text, "html.parser")
        stores = []
        for store in parsedHTML.find_all("div", class_="sli"):
            info = store.find("div", class_="sli-info-left")
            if info is None:
                continue
            name = info.find("div", class_="sli-title").get_text(strip=True)
            location = info.find("div", class_="sli-address").get_text(strip=True)
            if any(district in location.lower() for district in whitelist):
                stores.append({"name": name, "location": location})
        if stores:
            # Fix: the original wrote the misspelled key "satus" here, so
            # the success marker never appeared under "status" in the output.
            data[size] = {"status": "success", "stores": stores}
    return data, list(data.keys())
def getImageLinks(parsedHTML):
    """Collect one image URL per ``<img>`` in the product gallery.

    For each image, the first non-empty attribute wins, in priority order:
    ``src``, then ``srcset``, then ``data-srcset``.
    """
    gallery = parsedHTML.find('div', class_='pdp-gallery')
    links = []
    for img in gallery.find_all('img'):
        for attr in ('src', 'srcset', 'data-srcset'):
            value = img.get(attr)
            if value:
                links.append(value)
                break
    return links
def getTitle(parsedHTML):
    """Return the product page title: stripped text of the ``h1.pdp-title``."""
    heading = parsedHTML.find('h1', class_='pdp-title')
    return heading.text.strip()
def getProductInfo(url):
    """Scrape a single product page into a JSON-serializable dict.

    Combines the page title, image links and per-size store availability
    (whitelisted Istanbul districts only) for the product at *url*.
    """
    parsedHTML = getProductHTML(url)
    productCode, sizes = getProductCodesAndSizes(parsedHTML, url)
    storeData, sizes = getStoresAndSizes(productCode, sizes)
    return {
        "title": getTitle(parsedHTML),
        "url": url,
        "imageLinks": getImageLinks(parsedHTML),
        "stores": storeData,
    }
# Write results as a JSON array, streaming one product per iteration so
# partial progress survives an interrupted run.
# Fix: the original reopened data.json in append mode for every single
# product (plus once for the header and once for the footer); a single
# file handle with an explicit flush per item preserves the same
# crash-resilience without the churn.
with open('data.json', 'w') as file:
    file.write("[\n")
    for index, url in enumerate(productLinks):
        product_info = getProductInfo(url)
        json.dump(product_info, file, indent=4)
        # Comma after every element except the last, to keep the array valid.
        if index < len(productLinks) - 1:
            file.write(",")
        file.write("\n")
        file.flush()
        # Progress report: integer percent complete plus the product URL.
        print(str(int((index + 1) * 100 / len(productLinks))) + "% -> " + url)
    file.write("\n]")