python - Access text elements with dynamically loaded classes using Selenium - Stack Overflow


I am trying to fetch the coordinates for real estate properties, but I am unable to get the longitude and latitude because the page opens without a proper location. I also tried getting them via the Google Maps URL, but for some reason the current URL doesn't give me the Maps URL but the real-estate website's URL instead. Can anyone help me? The images for the exact button and the Google Map are: Website Button, Google Maps.

As you can see, when the driver opens the Google Maps tab, the coordinates are listed there; those are the ones I am trying to fetch, but I can't seem to because the class is dynamically loaded. I saw some tutorials using the div tag with a role attribute, which here is main, but that doesn't seem to work either.

As my last option I am using the current Google Maps URL to fetch the coordinates, which changes a bit here and there as the sites change in the loop. But doing it from the HTML of the Google Maps page is something I want to figure out and need help with. Looking forward to any answers!

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

path = r'chromedriver.exe'

service = Service(path)
browser = webdriver.Chrome(service=service)

links= []
num = 0

url = f'https://www.quikr.com/homes/property/residential-for-sale-in-bangalore-cid_23?q=eyJjbHVzdGVyQ2l0eSI6WyIyMyJdfQ%3D%3D&page=1'
    
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; AOL 9.0; Windows NT 5.1; {}; Alawar 2.08; .NET CLR 1.0.3705)'}
    
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, 'html')

sites = soup.find_all('div',class_='listingviewtiles')

for site in sites:
    for link in site.find_all('a',class_='img-anchor', href=True):
        links.append(link['href'])
            
num+=1
print(num, 'pages done')

link = links[1]

r = requests.get(link, headers=headers)
soup2 = BeautifulSoup(r.content, 'html')
price = soup2.find('div', class_ = 'sc-jj9qsf-0 fmMIFj sc-gs8z27-5 ksEDsX').text.strip()

print(price)

browser.get(link)
browser.maximize_window()
location_button = browser.find_element(By.CSS_SELECTOR, '.sc-ldbotn-0.hdNuPA.sc-yzpzu-3.eLCRYW.hA.bt2.bt1')
sleep(0.5)
browser.execute_script("arguments[0].click();", location_button)
google_maps_url = browser.current_url
print(f"Google Maps URL: {google_maps_url}")

import re

current_url = browser.current_url
print("Google Maps URL:", current_url)
# Regex pattern to find latitude and longitude
match = re.search(r"@([-.\d]+),([-.\d]+)", current_url)

if match:
    latitude, longitude = match.groups()
    print(f"Latitude: {latitude}, Longitude: {longitude}")
else:
    print("Coordinates not found in URL!")

asked Mar 12 at 11:49 by DEEPENDRA SINGH RAO
  • Why do you import selenium if you don't use it, but instead try to get the page with requests and BeautifulSoup? If the page has dynamic content (generated with JavaScript), then requests can be useless because it can't run JavaScript, and you may have to use Selenium, which can. – furas, Mar 12 at 12:24
  • At the end of the HTML I see a <script> tag with data, and it also contains {"__typename":"Coordinates","latitude":"13.079402","longitude":"77.661926"} – furas, Mar 12 at 12:56 (see the sketch below)
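
A minimal sketch of that idea, assuming the Coordinates fragment quoted above appears verbatim in the page source. The example listing URL and the need for a browser-like User-Agent header are taken from the answers below:

import json
import re
import requests

# Example listing URL from the answers below; any listing page should work.
listing_url = 'https://www.quikr.com/homes/3-bhk-apartment-of-2036sqft-for-sale-in-radiance-gardenia-bangalore/p/372255534/272495?source=qh'

# A browser-like User-Agent is needed to get a 200 response (see the second answer).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36'}

html = requests.get(listing_url, headers=headers).text

# Grab the embedded Coordinates object verbatim and parse it as JSON
# instead of capturing the numbers with regex groups.
fragment = re.search(r'\{"__typename":"Coordinates","latitude":"[^"]+","longitude":"[^"]+"\}', html)
if fragment:
    coords = json.loads(fragment.group(0))
    print(coords['latitude'], coords['longitude'])
else:
    print("Coordinates fragment not found in page source")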

2 Answers


As noted by @furas, the information is contained in the <script> tag at the bottom of the page source and can also be accessed directly there. An answer focusing on that approach will surely follow.

But to follow your approach, in case that information is not available: select more specifically by element id:

WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[id="Location"] button'))).click()

Navigate to the second tab after clicking:

WebDriverWait(driver, 30).until(lambda d: len(d.window_handles) > 1)
driver.switch_to.window(driver.window_handles[1])

Example for a single page:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

driver = webdriver.Chrome()

url = f'https://www.quikr.com/homes/3-bhk-apartment-of-2036sqft-for-sale-in-radiance-gardenia-bangalore/p/372255534/272495?source=qh'
driver.get(url)

WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[id="Location"] button'))).click()

WebDriverWait(driver, 30).until(lambda d: len(d.window_handles) > 1)
driver.switch_to.window(driver.window_handles[1])

# in case you have to handle the Google consent page ('Alle ablehnen' = 'Reject all' on the German-language consent screen)
# try:
#     consent_button = WebDriverWait(driver, 5).until(
#         EC.element_to_be_clickable((By.XPATH, "//button[span[contains(text(), 'Alle ablehnen')]]"))
#     )
#     consent_button.click()
# except Exception as e:
#     print("Consent-Button not found:", e)

match = re.search(r"/([-+]?\d*\.\d+),([-+]?\d*\.\d+)", driver.current_url)

if match:
    latitude = float(match.group(1))
    longitude = float(match.group(2))
    print("Latitude:", latitude)
    print("Longitude:", longitude)
else:
    print("Coordinates not found in URL!")

driver.switch_to.window(driver.window_handles[0])

Result:

Latitude: 13.079402
Longitude: 77.661926
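
Since the question loops over many listing links, here is a hedged sketch of wrapping the same per-listing steps in a loop, assuming a links list collected as in the question; the extra Google Maps tab is closed after each listing so the window handles stay predictable:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

driver = webdriver.Chrome()

links = []  # assumed: the listing URLs collected with requests/BeautifulSoup in the question
coordinates = {}

for link in links:
    driver.get(link)

    # Same waits as in the single-page example: click the location button
    # and wait for the Google Maps tab to open.
    WebDriverWait(driver, 30).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'div[id="Location"] button'))).click()
    WebDriverWait(driver, 30).until(lambda d: len(d.window_handles) > 1)
    driver.switch_to.window(driver.window_handles[1])

    # Pull the coordinates out of the Maps URL, as above.
    match = re.search(r"/([-+]?\d*\.\d+),([-+]?\d*\.\d+)", driver.current_url)
    if match:
        coordinates[link] = (float(match.group(1)), float(match.group(2)))

    # Close the Maps tab and return to the listing tab before the next iteration.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

print(coordinates)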

As mentioned by the others, the latitude, longitude, and all other pieces of information are contained inside the first script tag, which can be accessed directly by simply using the requests library.

Since the latitude and longitude keys appear only once in the whole HTML content, they can be extracted easily using a regex.

Below is a simple implementation:

import re
import requests

response = requests.get(url='https://www.quikr.com/homes/3-bhk-apartment-of-2036sqft-for-sale-in-radiance-gardenia-bangalore/p/372255534/272495?source=qh',
                        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"}
                        )
result = {}
if response.status_code == 200:
    pattern = r'latitude":"(.+)","longitude":"(.+)"},"adlink"'
    matches = re.findall(pattern=pattern, string=response.text)

    result["latitude"] = matches[0][0]
    result["longitude"] = matches[0][1]

print(result)

Output:

{'latitude': '13.079402', 'longitude': '77.661926'}

Note: It's necessary to pass the User-Agent header to get a successful 200 response.
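
One small design note on the pattern used above: .+ is greedy, so if the page ever contained a second coordinate block before "adlink", the capture groups could swallow too much. A sketch of a slightly stricter variant (same idea, just a tighter character class), tested here against the fragment quoted in the comments:

import re

# Stricter variant of the pattern above: [^"]+ cannot run past a closing quote,
# so each group stays inside the Coordinates object even if "adlink" appears elsewhere.
pattern = r'"latitude":"([^"]+)","longitude":"([^"]+)"'

# Sample fragment taken from the page source quoted in the comments above.
sample = '{"__typename":"Coordinates","latitude":"13.079402","longitude":"77.661926"},"adlink"'

match = re.search(pattern, sample)
if match:
    print({"latitude": match.group(1), "longitude": match.group(2)})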
