Skip to content
This repository was archived by the owner on Mar 1, 2024. It is now read-only.

Commit beacb50

Browse files
authored
Adding movie reviews link for IMDB loader (#630)
* Add files IMDB * Add files IMDB * Add to library json * Linting checks * Add to linting * Black linting * Import fixes * Readme and import os changes * linting via black * match id * dataframe to docs * Remove extra files * make dataframe optional * Add links for IMDB reviews
1 parent e4a265c commit beacb50

File tree

2 files changed

+24
-8
lines changed

2 files changed

+24
-8
lines changed

llama_hub/imdb_review/base.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,24 @@ def load_data(self) -> List[Document]:
2929
Returns:
3030
List[Document]: document object in llama index with date and rating as extra information
3131
"""
32-
reviews_date, reviews_title, reviews_comment, reviews_rating = main_scraper(
33-
self.movie_name_year, self.webdriver_engine, self.generate_csv
34-
)
32+
(
33+
reviews_date,
34+
reviews_title,
35+
reviews_comment,
36+
reviews_rating,
37+
reviews_link,
38+
) = main_scraper(self.movie_name_year, self.webdriver_engine, self.generate_csv)
3539

3640
all_docs = []
3741
for i in range(len(reviews_date)):
3842
all_docs.append(
3943
Document(
4044
text=reviews_title[i] + " " + reviews_comment[i],
41-
extra_info={"date": reviews_date[i], "rating": reviews_rating[i]},
45+
extra_info={
46+
"date": reviews_date[i],
47+
"rating": reviews_rating[i],
48+
"link": reviews_link[i],
49+
},
4250
)
4351
)
4452
return all_docs

llama_hub/imdb_review/scraper.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def scrape_data(revs):
5555
contents (str): the review of the movie
5656
rating (str): The rating given by the user
5757
title (str): the title of the review
58+
link (str): the link of the review
5859
"""
5960

6061
try:
@@ -75,7 +76,10 @@ def scrape_data(revs):
7576
title = revs.find_element(By.CLASS_NAME, "title").text.strip()
7677
except NoSuchElementException:
7778
title = ""
78-
79+
try:
80+
link = revs.find_element(By.CLASS_NAME, "title").get_attribute("href")
81+
except NoSuchElementException:
82+
link = ""
7983
try:
8084
rating = revs.find_element(
8185
By.CLASS_NAME, "rating-other-user-rating"
@@ -87,7 +91,7 @@ def scrape_data(revs):
8791
contents.replace("//", "")
8892
date = revs.find_element(By.CLASS_NAME, "review-date").text
8993
contents = clean_text(contents)
90-
return date, contents, rating, title
94+
return date, contents, rating, title, link
9195

9296

9397
def main_scraper(
@@ -105,6 +109,7 @@ def main_scraper(
105109
reviews_title (List): list of title of each review
106110
reviews_comment (List): list of comment of each review
107111
reviews_rating (List): list of ratings of each review
112+
reviews_link (List): list of links of each review
108113
"""
109114
ia = imdb.Cinemagoer()
110115
movies = ia.search_movie(movie_name)
@@ -148,13 +153,15 @@ def main_scraper(
148153
reviews_comment = []
149154
reviews_rating = []
150155
reviews_title = []
156+
reviews_link = []
151157
for result in results:
152-
date, contents, rating, title = result
158+
date, contents, rating, title, link = result
153159
reviews_date.append(date)
154160

155161
reviews_comment.append(contents)
156162
reviews_rating.append(rating)
157163
reviews_title.append(title)
164+
reviews_link.append(link)
158165

159166
# driver.quit()
160167
if generate_csv:
@@ -167,8 +174,9 @@ def main_scraper(
167174
df["review_title"] = reviews_title
168175
df["review_comment"] = reviews_comment
169176
df["review_rating"] = reviews_rating
177+
df["review_link"] = reviews_link
170178

171179
# print(df)
172180
df.to_csv(f"movie_reviews/{movie_name}.csv", index=False)
173181

174-
return reviews_date, reviews_title, reviews_comment, reviews_rating
182+
return reviews_date, reviews_title, reviews_comment, reviews_rating, reviews_link

0 commit comments

Comments
 (0)