Skip to content

Commit 551d742

Browse files
committed
Merge branch 'update/parser-updates' into dev
2 parents 8008dd5 + 9e0548d commit 551d742

69 files changed

Lines changed: 22934 additions & 304 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

WebSearcher/classifiers/header_text.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -52,20 +52,21 @@ def _get_header_level_mapping(level) -> dict:
5252
TYPE_TO_H2_MAPPING = {
5353
"directions": ["Directions",
5454
"Ubicaciones"],
55-
"discussions_and_forums": ["Discussions and forums"],
56-
"general": ["Complementary Results",
57-
"Web Result with Site Links",
58-
"Web results",
59-
"Resultados de la Web",
60-
"AI-powered overview",
61-
"Visión general creada por IA",
62-
"Things to know",
63-
"Cosas que debes saber"],
55+
"discussions_and_forums": ["Discussions and forums",
56+
"Questions & answers"],
57+
"general": ["Complementary Results",
58+
"Web Result with Site Links",
59+
"Web results",
60+
"Resultados de la Web",
61+
"AI-powered overview",
62+
"Visión general creada por IA"],
6463
"images": ["Images",
6564
"Imágenes"],
6665
"jobs": ["Jobs",
6766
"Empleos"],
68-
"knowledge": ["Calculator Result",
67+
"knowledge": ["Things to know",
68+
"Cosas que debes saber",
69+
"Calculator Result",
6970
"Featured snippet from the web", "Fragmento destacado",
7071
"Finance Results", "Resumen de Mercado",
7172
"From sources across the web",
@@ -84,7 +85,9 @@ def _get_header_level_mapping(level) -> dict:
8485
"Albums", "Álbumes",
8586
"About", "Información",
8687
"Profiles", "Perfiles"],
87-
"local_news": ["Local news", "Noticias Locales"],
88+
"latest_from": ["Latest from"],
89+
"local_news": ["Local news", "Noticias Locales",
90+
"Latest in local"],
8891
"local_results": [
8992
"Local Results",
9093
"Locations",

WebSearcher/classifiers/main.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,21 @@ def classify(cmpt: bs4.element.Tag) -> str:
1313

1414
# Ordered list of classifiers to try
1515
component_classifiers = [
16+
ClassifyMain.locations, # Check locations (hotels, etc.) before top_stories
1617
ClassifyMain.top_stories, # Check top stories
1718
ClassifyMain.discussions_and_forums, # Check discussions and forums
1819
ClassifyHeaderText.classify, # Check levels 2 & 3 header text
1920
ClassifyMain.news_quotes, # Check news quotes
2021
ClassifyMain.img_cards, # Check image cards
2122
ClassifyMain.images, # Check images
23+
ClassifyMain.ai_overview, # Check AI overview
2224
ClassifyMain.knowledge_panel, # Check knowledge panel
2325
ClassifyMain.knowledge_block, # Check knowledge components
2426
ClassifyMain.banner, # Check for banners
2527
ClassifyMain.finance_panel, # Check finance panel (classify as knowledge)
2628
ClassifyMain.map_result, # Check for map results
2729
ClassifyMain.general_questions, # Check hybrid general questions
30+
ClassifyMain.short_videos, # Check short videos carousel
2831
ClassifyMain.twitter, # Check twitter cards and results
2932
ClassifyMain.general, # Check general components
3033
ClassifyMain.people_also_ask, # Check people also ask
@@ -114,6 +117,15 @@ def images(cmpt: bs4.element.Tag) -> str:
114117
]
115118
return 'images' if any(conditions) else "unknown"
116119

120+
@staticmethod
def ai_overview(cmpt: bs4.element.Tag) -> str:
    """Classify AI Overview components.

    Returns 'knowledge' when the component contains a div with class
    "Fzsovc", or when its first h2 reads exactly "AI Overview" after
    whitespace stripping. Returns "unknown" otherwise.
    """
    # Marker div check.
    if cmpt.find("div", {"class": "Fzsovc"}):
        return 'knowledge'

    # Header text check; look the h2 up once and reuse it.
    first_h2 = cmpt.find("h2")
    if first_h2 and first_h2.get_text(strip=True) == "AI Overview":
        return 'knowledge'

    return "unknown"
128+
117129
@staticmethod
118130
def knowledge_block(cmpt: bs4.element.Tag) -> str:
119131
"""Classify knowledge block components"""
@@ -133,7 +145,7 @@ def knowledge_box(cmpt: bs4.element.Tag) -> str:
133145
bool(cmpt.find("div", {"jscontroller": "Z2bSc"}))
134146
)
135147
condition['maps'] = webutils.check_dict_value(attrs, "data-hveid", "CAMQAA")
136-
condition['hotels'] = cmpt.find("div", {"class": "zd2Jbb"})
148+
condition['locations'] = cmpt.find("div", {"class": "zd2Jbb"})
137149
condition['events'] = cmpt.find("g-card", {"class": "URhAHe"})
138150
condition['jobs'] = cmpt.find("g-card", {"class": "cvoI5e"})
139151
text_list = list(cmpt.stripped_strings)
@@ -176,6 +188,24 @@ def people_also_ask(cmpt: bs4.element.Tag) -> str:
176188
conditions = webutils.check_dict_value(cmpt.attrs, "class", class_list)
177189
return 'people_also_ask' if conditions else "unknown"
178190

191+
@staticmethod
def short_videos(cmpt: bs4.element.Tag) -> str:
    """Classify short videos carousel.

    Returns 'short_videos' when the component has a span with
    role="heading" and class "IFnjPb" whose stripped text is exactly
    'Short videos'. Returns "unknown" otherwise.
    """
    title_span = cmpt.find('span', {'role': 'heading', 'class': 'IFnjPb'})
    is_short_videos = (
        bool(title_span)
        and title_span.get_text(strip=True) == 'Short videos'
    )
    return 'short_videos' if is_short_videos else "unknown"
198+
199+
@staticmethod
def locations(cmpt: bs4.element.Tag) -> str:
    """Classify locations components (hotels, etc.).

    Looks at the first element carrying role="heading" and returns
    'locations' when its stripped text begins with 'Hotels' or
    'More Hotels'. Returns "unknown" when there is no such element or
    the text does not match.
    """
    first_heading = cmpt.find(attrs={'role': 'heading'})
    if not first_heading:
        return "unknown"

    label = first_heading.get_text(strip=True)
    # str.startswith accepts a tuple of prefixes: one call covers both.
    if label.startswith(('Hotels', 'More Hotels')):
        return 'locations'
    return "unknown"
208+
179209
@staticmethod
180210
def top_stories(cmpt: bs4.element.Tag) -> str:
181211
"""Classify top stories components"""

WebSearcher/component_parsers/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@
1818
from .recent_posts import parse_recent_posts
1919

2020
from .local_results import parse_local_results
21+
from .locations import parse_locations
2122
from .map_results import parse_map_results
2223
from .news_quotes import parse_news_quotes
2324
from .people_also_ask import parse_people_also_ask
2425
from .scholarly_articles import parse_scholarly_articles
2526
from .searches_related import parse_searches_related
27+
from .short_videos import parse_short_videos
2628
from .shopping_ads import parse_shopping_ads
2729
from .twitter_cards import parse_twitter_cards
2830
from .twitter_result import parse_twitter_result
@@ -54,13 +56,15 @@
5456
('latest_from', parse_latest_from, 'Latest From'),
5557
('local_news', parse_local_news, 'Local News'),
5658
('local_results', parse_local_results, 'Local Results'),
59+
('locations', parse_locations, 'Locations'),
5760
('map_results', parse_map_results, 'Map Results'),
5861
('news_quotes', parse_news_quotes, 'News Quotes'),
5962
('people_also_ask', parse_people_also_ask, 'People Also Ask'),
6063
('perspectives', parse_perspectives, 'Perspectives & Opinions'),
6164
('recent_posts', parse_recent_posts, 'Recent Posts'),
6265
('scholarly_articles', parse_scholarly_articles, 'Scholar Articles'),
6366
('searches_related', parse_searches_related, 'Related Searches'),
67+
('short_videos', parse_short_videos, 'Short Videos'),
6468
('shopping_ads', parse_shopping_ads, 'Shopping Ad'),
6569
('top_stories', parse_top_stories, 'Top Stories'),
6670
('twitter_cards', parse_twitter_cards, 'Twitter Cards'),

WebSearcher/component_parsers/ads.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ def parse_ad_menu(sub: bs4.element.Tag) -> list:
183183
"""Parse menu items for a large ad with additional subresults"""
184184

185185
parsed_items = DetailsList()
186+
187+
# Format 1: MhgNwc items with MUxGbd sub-divs
186188
menu_items = sub.find_all('div', {'class': 'MhgNwc'})
187189
for item in menu_items:
188190
parsed_item = DetailsItem()
@@ -194,6 +196,17 @@ def parse_ad_menu(sub: bs4.element.Tag) -> list:
194196
else:
195197
parsed_item.text = webutils.get_text(div) or ''
196198
parsed_items.append(parsed_item)
199+
200+
# Format 2: bOeY0b sitelinks section
201+
if not parsed_items:
202+
sitelink_div = sub.find('div', {'class': 'bOeY0b'})
203+
if sitelink_div:
204+
for link in sitelink_div.find_all('a', href=True):
205+
text = link.get_text(strip=True)
206+
href = link.get('href', '')
207+
if text and href:
208+
parsed_items.append(DetailsItem(url=href, title=text))
209+
197210
return parsed_items.to_dicts()
198211

199212

WebSearcher/component_parsers/discussions_and_forums.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,28 +11,50 @@
1111
('div', {'class': 'VZGVuc'}),
1212
]
1313

14+
SUB_SELECTORS = [
15+
("div", {"class": "LJ7wUe"}),
16+
("div", {"class": "JlqpRe"}),
17+
("div", {"class": "EDblX"}),
18+
]
19+
1420

15-
def parse_discussions_and_forums(cmpt:bs4.element.Tag) -> list:
21+
def parse_discussions_and_forums(cmpt: bs4.element.Tag) -> list:
1622
"""Parse a 'Discussions and forums' component"""
17-
subs = cmpt.find_all("div", {"class":"LJ7wUe"})
18-
parsed_list = [parse_discussions_and_forums_item(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
19-
return parsed_list
23+
for tag, attrs in SUB_SELECTORS:
24+
subs = cmpt.find_all(tag, attrs)
25+
if subs:
26+
return [parse_item(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
27+
return []
28+
2029

21-
def parse_discussions_and_forums_item(cmpt:bs4.element.Tag, sub_rank:int = 0) -> dict:
30+
def parse_item(cmpt: bs4.element.Tag, sub_rank: int = 0) -> dict:
2231
"""Parse a 'Discussions and forums' subcomponent"""
2332
return {
2433
"type": "discussions_and_forums",
2534
"sub_type": None,
2635
"sub_rank": sub_rank,
27-
"title": webutils.get_text_by_selectors(cmpt, TITLE_SELECTORS),
36+
"title": get_title(cmpt),
2837
"url": get_url(cmpt),
29-
"cite": webutils.get_text_by_selectors(cmpt, CITE_SELECTORS)
38+
"cite": get_cite(cmpt),
3039
}
3140

41+
42+
def get_title(sub):
    """Return the title of a subcomponent.

    Tries TITLE_SELECTORS first; when that yields nothing truthy, falls
    back to the text of a div with role="heading".
    """
    return (
        webutils.get_text_by_selectors(sub, TITLE_SELECTORS)
        or webutils.get_text(sub, 'div', {'role': 'heading'})
    )
48+
49+
50+
def get_cite(sub):
    """Return the cite text of a subcomponent, located via CITE_SELECTORS."""
    cite = webutils.get_text_by_selectors(sub, CITE_SELECTORS)
    return cite
53+
54+
3255
def get_url(sub):
3356
"""Get URL from a subcomponent; try multiple, take first non-null"""
34-
url_list = [webutils.get_link(sub, {"class":"v4kUNc"}),
57+
url_list = [webutils.get_link(sub, {"class": "v4kUNc"}),
3558
webutils.get_link(sub)]
3659
url_list = [url for url in url_list if url]
3760
return url_list[0] if url_list else None
38-

WebSearcher/component_parsers/knowledge.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,19 @@ def parse_knowledge_panel(cmpt, sub_rank=0) -> list:
7979
):
8080
parsed['sub_type'] = 'finance'
8181

82-
elif cmpt.find('div', {'role':'button'}) and cmpt.find('div', {'role':'button'}).text == 'Dictionary':
82+
elif (
83+
cmpt.find('div', {'data-attrid': 'DictionaryHeader'}) or
84+
(cmpt.find('div', {'role':'button'}) and cmpt.find('div', {'role':'button'}).text == 'Dictionary')
85+
):
8386
parsed['sub_type'] = 'dictionary'
84-
span_first = cmpt.find('span', {'jsslot':''})
85-
if span_first:
86-
span = span_first.find_all('span')
87-
details['text'] = get_text(span).split('Translate')[0] if span else None
87+
vmod = cmpt.find('div', {'class': 'vmod'})
88+
if vmod:
89+
details['text'] = vmod.get_text(' ', strip=True).split('Translate')[0]
90+
else:
91+
span_first = cmpt.find('span', {'jsslot':''})
92+
if span_first:
93+
span = span_first.find_all('span')
94+
details['text'] = get_text(span).split('Translate')[0] if span else None
8895

8996
elif (
9097
cmpt.find('h2') and cmpt.find('h2').text == 'Translation Result' or
@@ -102,6 +109,12 @@ def parse_knowledge_panel(cmpt, sub_rank=0) -> list:
102109
span = cmpt.find_all(['span'])
103110
details['text'] = get_text(span) if span else None
104111

112+
elif cmpt.find('span', {'role': 'heading', 'class': 'IFnjPb'}):
113+
heading_span = cmpt.find('span', {'role': 'heading', 'class': 'IFnjPb'})
114+
if heading_span and heading_span.text.strip() in ('Things to know', 'Cosas que debes saber'):
115+
parsed['sub_type'] = 'things_to_know'
116+
details['heading'] = heading_span.text.strip()
117+
105118
else:
106119
parsed['sub_type'] = 'panel'
107120
div = cmpt.find_all(['span','div','a'], string=True)
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""Parser for locations components (hotels, etc.)"""
2+
3+
import bs4
4+
from .. import webutils
5+
6+
7+
def parse_locations(cmpt: bs4.element.Tag) -> list:
    """Parse a locations component (e.g. hotel listings).

    Dispatches on the sub-type reported by classify_locations_sub_type.
    Hotel components are handed to parse_hotels; any other sub-type
    produces a one-element list holding an error record.
    """
    sub_type = classify_locations_sub_type(cmpt)

    # Only the hotels sub-type has a parser; report anything else.
    if sub_type != 'hotels':
        return [{'type': 'locations', 'sub_rank': 0, 'error': f'unknown sub_type: {sub_type}'}]

    return parse_hotels(cmpt)
14+
15+
16+
def classify_locations_sub_type(cmpt: bs4.element.Tag) -> str:
    """Classify the sub-type of a locations component.

    Returns 'hotels' when the first role="heading" element mentions
    'Hotel', or when the component contains an anchor whose href includes
    '/travel/'. Returns 'unknown' otherwise.
    """
    heading = cmpt.find(attrs={'role': 'heading'})
    # 'Hotel' is a substring of 'Hotels', so one membership test covers
    # both the singular and plural forms.
    if heading and 'Hotel' in heading.get_text(strip=True):
        return 'hotels'

    # Fallback: a /travel/ link marks the component as a hotel listing.
    if cmpt.find('a', href=lambda h: h and '/travel/' in h):
        return 'hotels'

    return 'unknown'
27+
28+
29+
def parse_hotels(cmpt: bs4.element.Tag) -> list:
    """Parse hotel items from a locations component.

    A hotel item is an anchor whose href contains '/travel/' and which
    holds a name div (class 'sxdlOc' or 'BTPx6e'). Items are ranked in
    the order they are accepted. When no anchor qualifies, a
    one-element list holding an error record is returned.
    """

    def has_name(link):
        # Anchors without a recognizable name div are not hotel cards.
        return bool(
            link.find('div', {'class': 'sxdlOc'})
            or link.find('div', {'class': 'BTPx6e'})
        )

    hotel_links = [
        link
        for link in cmpt.find_all('a', href=True)
        if '/travel/' in link.get('href', '') and has_name(link)
    ]

    if not hotel_links:
        return [{'type': 'locations', 'sub_type': 'hotels', 'sub_rank': 0,
                 'error': 'no hotel items found'}]

    return [_parse_hotel_item(link, rank) for rank, link in enumerate(hotel_links)]
46+
47+
48+
def _parse_hotel_item(a: bs4.element.Tag, sub_rank: int) -> dict:
    """Parse a single hotel item from an anchor tag.

    The title comes from the name div (class 'sxdlOc', else 'BTPx6e'),
    the text from the 'S7Ajc' div, the url from the anchor's href, and
    the details from the price / rating / reviews / stars spans. Missing
    title or text elements yield None for that field.
    """

    def text_or_none(node):
        return node.get_text(strip=True) if node else None

    name_div = a.find('div', {'class': 'sxdlOc'}) or a.find('div', {'class': 'BTPx6e'})
    details = _parse_hotel_details(
        a.find('span', {'class': 'sRlU8b'}),  # price
        a.find('span', {'class': 'yi40Hd'}),  # rating
        a.find('span', {'class': 'RDApEe'}),  # reviews
        a.find('span', {'class': 'NAkmnc'}),  # stars
    )
    description = a.find('div', {'class': 'S7Ajc'})

    return {
        'type': 'locations',
        'sub_type': 'hotels',
        'sub_rank': sub_rank,
        'title': text_or_none(name_div),
        'url': a.get('href'),
        'text': text_or_none(description),
        'cite': None,
        'details': details,
    }
67+
68+
69+
def _parse_hotel_details(price_span, rating_span, reviews_span, stars_span) -> "dict | None":
    """Extract hotel metadata from the optional detail spans.

    Each argument is either a falsy value (element not found) or an
    object exposing ``get_text(strip=True)``. Only spans that are present
    contribute a key, in the fixed order price, rating, reviews, stars.

    Returns:
        A dict of the stripped texts that were found, or None when no
        span was present.
    """
    spans = {
        'price': price_span,
        'rating': rating_span,
        'reviews': reviews_span,
        'stars': stars_span,
    }
    details = {
        key: span.get_text(strip=True)
        for key, span in spans.items()
        if span
    }
    # An empty mapping is reported as None rather than {}.
    return details or None

0 commit comments

Comments
 (0)