-
Notifications
You must be signed in to change notification settings - Fork 5
/
imdb_scraper.php
185 lines (151 loc) · 5.36 KB
/
imdb_scraper.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
<?php
/*
IMDb Scraper v. 1.0 - 14th of September, 2011
Scrapes information about movie and tv show titles from IMDb (imdb.com).
By Aram Kocharyan
http://ak.net84.net/php/imdb-scraper/
twitter.com/akarmenia
*/
// Utility functions
require_once('util.php');
// Prevent timeout: the scraper below fetches remote pages, which can outlast
// PHP's default execution limit. Both calls set the limit to 0, which PHP
// treats as "no limit". Note this takes effect for the whole including script
// as soon as this file is loaded.
set_time_limit(0);
ini_set('max_execution_time', 0);
/**
 * Static helpers for looking up a title on IMDb and scraping its details.
 *
 * Relies on helper functions that are not defined in this file (presumably in
 * util.php): curl_get_html, regex_get, clean_num, clean_str, imdb_url_id and
 * substr_count_arrays.
 */
class IMDbScraper {

    /**
     * Performs an IMDb search and returns the info for the best match using
     * the given query title and year.
     *
     * @param string          $title Title to search for.
     * @param int|string|null $year  Optional release year used to narrow the match.
     * @return array|false Info array (see info()), or FALSE when nothing was
     *                     found or the title page could not be fetched.
     * @throws Exception When the title is not a non-empty string.
     */
    public static function get($title, $year = NULL) {
        // Forward the caller's year unchanged. The previous code passed
        // `$year = NULL`, which silently discarded it.
        $result = self::find($title, $year);
        if ($result === FALSE) {
            return FALSE;
        }
        return self::info($result['id']);
    }

    /**
     * Returns an array of info for a given IMDb id string, e.g. 'tt0206512'.
     *
     * @param string $id IMDb title id; any character other than 't' or a
     *                   digit is stripped before use.
     * @return array|false Scraped info plus 'id' and 'url' keys, or FALSE when
     *                     the id is empty after sanitising or the fetch fails.
     * @throws Exception When $id is not a string.
     */
    public static function info($id) {
        if (!is_string($id)) {
            throw new Exception("The id must be a string");
        }
        $id = preg_replace('#[^t\d]#', '', $id);
        // Nothing usable left: do not request the bogus '/title//' URL.
        if ($id === NULL || $id === '') {
            return FALSE;
        }

        $url = 'http://www.imdb.com/title/' . $id . '/';
        $html = curl_get_html($url);
        if ($html === FALSE) {
            return FALSE;
        }

        $info = self::scrape_info($html);
        $info['id'] = $id;
        $info['url'] = $url;
        return $info;
    }

    /**
     * Returns the list of IMDb search results for the given title query.
     *
     * Declared static because it is invoked as self::search() from static
     * context; it can still be called on an instance.
     *
     * @param string $title Search query.
     * @return array List of results, each array(id, title, year); empty when
     *               the page could not be fetched or nothing matched.
     * @throws Exception When $title is not a string.
     */
    public static function search($title) {
        if (!is_string($title)) {
            // Arrays/objects cannot be safely concatenated; report their type instead.
            $shown = is_scalar($title) ? $title : gettype($title);
            throw new Exception("The title '" . $shown . "' is not valid");
        }
        $url = 'http://www.imdb.com/find?s=tt&q=' . urlencode($title);
        $html = curl_get_html($url);
        if (!is_string($html)) {
            return array();
        }
        return self::scrape_search($html);
    }

    /**
     * Performs an IMDb search and finds the best match to the given title and year.
     *
     * Candidates are first narrowed to those whose year matches (when a year
     * is given and at least one candidate matches), then ranked by how many
     * of the query words occur in each candidate title. Ties go to the
     * earliest candidate.
     *
     * @param string          $title Title to search for.
     * @param int|string|null $year  Optional year; non-numeric strings are ignored.
     * @return array|false array('id' => ..., 'title' => ..., 'year' => ...),
     *                     or FALSE when the search returned nothing.
     * @throws Exception When the title is not a non-empty string.
     */
    public static function find($title, $year = NULL) {
        // Compare against '' rather than empty() so the title "0" is accepted.
        if (!is_string($title) || $title === '') {
            throw new Exception("The title is not valid");
        }

        if (is_string($year)) {
            // A non-numeric year would become 0 and pollute the query; drop it.
            $year = is_numeric($year) ? intval($year) : NULL;
        }

        $query = $title;
        if (is_int($year)) {
            $query .= ' ' . $year;
        }

        // Get results for the search query
        $results = self::search($query);
        if (empty($results)) {
            return FALSE;
        }

        // Keep only the results matching the year; if no year was provided, or
        // none match, fall back to the full result list.
        $subset = $results;
        if ($year !== NULL) {
            $same_year = array();
            foreach ($results as $r) {
                if (intval($r[2]) == $year) {
                    $same_year[] = $r;
                }
            }
            if (!empty($same_year)) {
                $subset = $same_year;
            }
        }

        // Break the title query into words, skipping empty pieces caused by
        // repeated whitespace.
        $query_bits = preg_split('/\s+/', $title, -1, PREG_SPLIT_NO_EMPTY);

        // Titles are taken from $subset (not $results) so that the index of
        // the best count refers to the same list we pick the result from.
        $titles = array();
        foreach ($subset as $r) {
            $titles[] = $r[1];
        }

        // Presumably returns one match count per title, in order (helper is
        // defined outside this file).
        $counts = substr_count_arrays($titles, $query_bits);

        // TODO check the results and see if the counts are equal (no good matches)
        // Get the highest count, or if they are all equal use the first result
        $highest_index = 0;
        $highest_count = isset($counts[0]) ? $counts[0] : 0;
        $total = min(count($counts), count($subset));
        for ($i = 1; $i < $total; $i++) {
            if ($counts[$i] > $highest_count) {
                // Track the running maximum; without this the last entry
                // beating the first one would win instead of the best one.
                $highest_count = $counts[$i];
                $highest_index = $i;
            }
        }

        // Create an associative array, now that we have our result
        $best = $subset[$highest_index];
        return array(
            'id'    => $best[0],
            'title' => $best[1],
            'year'  => $best[2],
        );
    }

    /**
     * Returns an associative array of IMDb information scraped from an HTML string.
     *
     * @param string $html HTML of an IMDb title page.
     * @return array Keys: name, desc, date, duration, director, writer,
     *               creator, cast, genres, plot, rating, max-rating,
     *               voter-count, user-review-count, critic-review-count.
     */
    public static function scrape_info($html) {
        $result = array();
        $result['name'] = regex_get('#<h1.*?>(.*?)<span#msi', $html, 1);
        $result['desc'] = regex_get('#"description">(.*?)</p>#msi', $html, 1);

        // Prefer the datetime attribute; fall back to the parenthesised part
        // of the <title> when it is missing.
        $date = regex_get('#datetime="(\d+)#msi', $html, 1, 'num');
        if (empty($date)) {
            $date = clean_num(regex_get('#<title>[^\(]*\(([^\)]+)\)#msi', $html, 1, 'num'));
        }
        $result['date'] = $date;
        $result['duration'] = regex_get('#class="absmiddle"[^<]*?(\d+\s*min)#msi', $html, 1);

        // Only for Movies
        // The director is matched with its own label (it previously reused
        // the writer pattern and therefore duplicated the writer).
        $result['director'] = regex_get('#director.*?([\s\w]*)</a#msi', $html, 1);
        $result['writer'] = regex_get('#writer.*?([\s\w]*)</a#msi', $html, 1);

        // Only for TV shows
        $result['creator'] = regex_get('#creator.*?([\s\w]*)</a#msi', $html, 1);

        $result['cast'] = array();
        if (preg_match_all('#class="name".*?>([^<]*)</a>#msi', $html, $cast)) {
            $result['cast'] = $cast[1];
        }

        $result['genres'] = array();
        if (preg_match_all('#/genre/([^"]*)"\s*>\1#msi', $html, $genre)) {
            $result['genres'] = $genre[1];
        }

        $result['plot'] = regex_get('#storyline</h2>\s*<p>(.*?)<#msi', $html, 1);
        $result['rating'] = regex_get('#"ratingValue">(.*?)<#msi', $html, 1, 'num');
        $result['max-rating'] = regex_get('#"bestRating">(.*?)<#msi', $html, 1, 'num');
        $result['voter-count'] = regex_get('#"ratingCount">(.*?)<#msi', $html, 1, 'num');
        $result['user-review-count'] = regex_get('#"reviewCount">(.*?)<#msi', $html, 1, 'num');
        $result['critic-review-count'] = regex_get('#(\d+) external critic#msi', $html, 1, 'num');
        return $result;
    }

    /**
     * Returns an array of search results for the given HTML string of an
     * IMDb search page.
     *
     * A result is an anchor immediately followed by a parenthesised year,
     * e.g. <a href="/title/tt0133093/">The Matrix</a> (1999).
     *
     * @param string $html HTML of an IMDb search page.
     * @return array List of results, each array(title ID, title, year).
     */
    public static function scrape_search($html) {
        $results = array();
        // The href capture stops at the closing quote ([^"]), so it cannot
        // swallow earlier anchors that precede the real title link.
        $pattern = '#<a\s*href\s*=\s*"([^"]*?)"[^>]*?>([^<]*)</a>\s*\((\d*)\)#msi';
        if (preg_match_all($pattern, $html, $matches)) {
            foreach ($matches[1] as $i => $href) {
                $results[] = array(
                    imdb_url_id($href),
                    clean_str($matches[2][$i]),
                    clean_str($matches[3][$i]),
                );
            }
        }
        return $results;
    }
}
?>