From 7ad3b92a1af1e152281ba39c98ef4657b0411cc5 Mon Sep 17 00:00:00 2001 From: Tom Schulze Date: Mon, 5 Sep 2022 15:47:59 +0200 Subject: [PATCH] refactor parsing responses --- CHANGELOG.md | 4 + src/parsers/AccountVersion1Parser.php | 42 +++++ src/parsers/Parser.php | 41 +++++ src/parsers/TagVersion1Parser.php | 47 ++++++ src/parsers/TagVersion2Parser.php | 53 ++++++ src/parsers/TagVersion3Parser.php | 44 +++++ src/services/InstagramService.php | 229 ++++++++++---------------- src/structures.php | 20 +++ 8 files changed, 340 insertions(+), 140 deletions(-) create mode 100644 src/parsers/AccountVersion1Parser.php create mode 100644 src/parsers/Parser.php create mode 100644 src/parsers/TagVersion1Parser.php create mode 100644 src/parsers/TagVersion2Parser.php create mode 100644 src/parsers/TagVersion3Parser.php create mode 100644 src/structures.php diff --git a/CHANGELOG.md b/CHANGELOG.md index b5e28f8..488fae7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ - Set referrer in proxy requests. - Set plugin version in proxy requests. +### Changed + +- Refactored the parsers to make them more general and easier to adapt to new structures in the future. + ## 2.0.1 - 2022-06-03 [CRITICAL] > {note} Instagram has changed the data structure on 06/01/2022. Without this update, the plugin will no longer work. diff --git a/src/parsers/AccountVersion1Parser.php b/src/parsers/AccountVersion1Parser.php new file mode 100644 index 0000000..6cbbe9c --- /dev/null +++ b/src/parsers/AccountVersion1Parser.php @@ -0,0 +1,42 @@ +getBestPicture($media['node']['thumbnail_resources']); + $item['imageSource'] = $media['node']['display_url']; + $item['likes'] = $media['node']['edge_liked_by']['count'] ?? 0; + $item['comments'] = $media['node']['edge_media_to_comment']['count'] ?? 0; + $item['shortcode'] = $media['node']['shortcode']; + $item['timestamp'] = $media['node']['taken_at_timestamp']; + $item['caption'] = $media['node']['edge_media_to_caption']['edges'][0]['node']['text'] ?? 
''; + $item['isVideo'] = (bool)$media['node']['is_video']; + if ($item['isVideo']) { + $item['hasAudio'] = isset($media['node']['has_audio']) && $media['node']['has_audio']; + $item['video_view_count'] = $media['node']['video_view_count'] ?? 0; + } + $items[] = $item; + } + + return $items; + } + + protected function getPictureMapping(): array + { + return [ + 'width' => 'config_width', + 'height' => 'config_height', + 'url' => 'src', + ]; + } +} \ No newline at end of file diff --git a/src/parsers/Parser.php b/src/parsers/Parser.php new file mode 100644 index 0000000..44fd2dc --- /dev/null +++ b/src/parsers/Parser.php @@ -0,0 +1,41 @@ +getPictureMapping(); + + foreach ($pictures as $picture) { + if (!isset($picture[$mapping['width']], $picture[$mapping['height']], $picture[$mapping['url']])) { + throw new \yii\base\Exception('Unexpected picture structure: width, height or URL key is missing.'); + } + $pixels = $picture[$mapping['width']] * $picture[$mapping['height']]; + if ($pixels > $maxPixels) { + $url = $picture[$mapping['url']]; + + $maxPixels = $pixels; + } + } + + return $url; + } +} \ No newline at end of file diff --git a/src/parsers/TagVersion1Parser.php b/src/parsers/TagVersion1Parser.php new file mode 100644 index 0000000..5dc405a --- /dev/null +++ b/src/parsers/TagVersion1Parser.php @@ -0,0 +1,47 @@ +getBestPicture($node['media']['carousel_media'][0]['image_versions2']['candidates']); + } else { + $item['thumbnailSource'] = $this->getBestPicture($node['media']['image_versions2']['candidates']); + } + $item['imageSource'] = $item['thumbnailSource']; + $item['likes'] = $node['media']['like_count'] ?? 0; + $item['comments'] = $node['media']['comment_count'] ?? 0; + $item['shortcode'] = $node['media']['code']; + $item['timestamp'] = $node['media']['taken_at']; + $item['caption'] = $node['media']['caption']['text'] ?? 
''; + $item['isVideo'] = (int)$node['media']['media_type'] === 2; + if ($item['isVideo']) { + $item['hasAudio'] = isset($node['media']['has_audio']) && $node['media']['has_audio']; + } + $item['video_view_count'] = $node['media']['video_view_count'] ?? 0; + $items[] = $item; + } + } + + return $items; + } + + protected function getPictureMapping(): array + { + return [ + 'width' => 'width', + 'height' => 'height', + 'url' => 'url', + ]; + } +} \ No newline at end of file diff --git a/src/parsers/TagVersion2Parser.php b/src/parsers/TagVersion2Parser.php new file mode 100644 index 0000000..c88de7e --- /dev/null +++ b/src/parsers/TagVersion2Parser.php @@ -0,0 +1,53 @@ +getBestPicture($node['media']['carousel_media'][0]['image_versions2']['candidates']); + } else { + $item['thumbnailSource'] = $this->getBestPicture($node['media']['image_versions2']['candidates']); + } + $item['imageSource'] = $item['thumbnailSource']; + $item['likes'] = $node['media']['like_count'] ?? 0; + $item['comments'] = $node['media']['comment_count'] ?? 0; + $item['shortcode'] = $node['media']['code']; + $item['timestamp'] = $node['media']['taken_at']; + $item['caption'] = $node['media']['caption']['text'] ?? ''; + $item['isVideo'] = (int)$node['media']['media_type'] === 2; + if ($item['isVideo']) { + $item['hasAudio'] = isset($node['media']['has_audio']) && $node['media']['has_audio']; + $item['video_view_count'] = $node['media']['video_view_count'] ?? 
0; + } + $items[] = $item; + } + } + + return $items; + } + + protected function getPictureMapping(): array + { + return [ + 'width' => 'width', + 'height' => 'height', + 'url' => 'url', + ]; + } +} \ No newline at end of file diff --git a/src/parsers/TagVersion3Parser.php b/src/parsers/TagVersion3Parser.php new file mode 100644 index 0000000..847627c --- /dev/null +++ b/src/parsers/TagVersion3Parser.php @@ -0,0 +1,44 @@ +getBestPicture($media['node']['thumbnail_resources']); + $item['imageSource'] = $media['node']['display_url']; + $item['likes'] = $media['node']['edge_liked_by']['count'] ?? 0; + $item['comments'] = $media['node']['edge_media_to_comment']['count'] ?? 0; + $item['shortcode'] = $media['node']['shortcode']; + $item['timestamp'] = $media['node']['taken_at_timestamp']; + $item['caption'] = $media['node']['edge_media_to_caption']['edges'][0]['node']['text'] ?? ''; + $item['isVideo'] = (bool)$media['node']['is_video']; + if ($item['isVideo']) { + $item['hasAudio'] = isset($media['node']['has_audio']) && $media['node']['has_audio']; + $item['video_view_count'] = $media['node']['video_view_count'] ?? 
0; + } + $items[] = $item; + } + + return $items; + } + + protected function getPictureMapping(): array + { + return [ + 'width' => 'config_width', + 'height' => 'config_height', + 'url' => 'src', + ]; + } +} \ No newline at end of file diff --git a/src/services/InstagramService.php b/src/services/InstagramService.php index 58e693d..e243c4f 100644 --- a/src/services/InstagramService.php +++ b/src/services/InstagramService.php @@ -16,10 +16,6 @@ class InstagramService extends Component { - public const STRUCTURE_VERSION_1 = 1; - - public const STRUCTURE_VERSION_2 = 2; - public const CACHE_TAG = 'instagramfeed'; private const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'; @@ -84,10 +80,10 @@ public function getFeed(string $accountOrTag = null): array } if (!empty($cachedItems)) { - // If not updated expand cache time and set update to 15min to stop from retrying every request + // If not updated expand cache time and set update to six hours to stop from retrying every request - Craft::info('Error fetching new data from Instagram, using existing cached data and expanding cache time. Stopping requests for 15 minutes.', 'instagramfeed'); + Craft::info('Error fetching new data from Instagram, using existing cached data and expanding cache time. Stopping requests for six hours.', 'instagramfeed'); $cacheService->set('instagram_data_' . $hash, $cachedItems, 2592000, $dependency); - $cacheService->set('instagram_update_error_' . $hash, true, 900, $dependency); + $cacheService->set('instagram_update_error_' . $hash, true, 21600, $dependency); } if (empty($cachedItems)) { @@ -112,9 +108,8 @@ public function getFeed(string $accountOrTag = null): array * @param string $account The account name to fetch. 
* * @return array - * @throws GuzzleException - * @throws \yii\base\ErrorException - * @throws \yii\base\Exception + * @throws \GuzzleHttp\Exception\GuzzleException + * @throws \craft\errors\SiteNotFoundException */ private function getInstagramAccountData(string $account): array { @@ -131,26 +126,14 @@ private function getInstagramAccountData(string $account): array if (false === $obj) { return []; } - - return $this->flattenMediaArray($obj['data']['user']['edge_owner_to_timeline_media']['edges'], self::STRUCTURE_VERSION_1); - } - - $obj = $this->parseInstagramResponse($html); - if (empty($obj)) { - return []; - } - - if (!array_key_exists('ProfilePage', $obj['entry_data'])) { - if (stripos($html, 'welcome back to instagram') !== false) { - Craft::error('Instagram account data could not be fetched. It seems that your IP address has been blocked by Instagram. See https://github.com/codemonauts/craft-instagram-feed/issues/32', 'instagramfeed'); - } else { - Craft::error('Instagram account data could not be fetched. Maybe the site structure has changed.', 'instagramfeed'); + } else { + $obj = $this->parseInstagramResponse($html); + if (empty($obj)) { + return []; } - - return []; } - return $this->flattenMediaArray($obj['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges'], self::STRUCTURE_VERSION_1); + return $this->extractMedia($obj); } /** @@ -159,8 +142,8 @@ private function getInstagramAccountData(string $account): array * @param string $tag The tag name to fetch. 
* * @return array - * @throws \yii\base\ErrorException - * @throws \yii\base\Exception|\GuzzleHttp\Exception\GuzzleException + * @throws \GuzzleHttp\Exception\GuzzleException + * @throws \craft\errors\SiteNotFoundException */ private function getInstagramTagData(string $tag): array { @@ -180,64 +163,14 @@ private function getInstagramTagData(string $tag): array if (false === $obj) { return []; } - - return $this->flattenMediaArray($obj['data']['recent']['sections'], self::STRUCTURE_VERSION_2); - } - - $obj = $this->parseInstagramResponse($html); - if (empty($obj)) { - return []; - } - - if (!array_key_exists('TagPage', $obj['entry_data'])) { - if (stripos($html, 'welcome back to instagram') !== false) { - Craft::error('Instagram tag data could not be fetched. It seems that your IP address has been blocked by Instagram. See https://github.com/codemonauts/craft-instagram-feed/issues/32', 'instagramfeed'); - } else { - Craft::error('Instagram tag data could not be fetched. Maybe the site structure has changed.', 'instagramfeed'); - } - - return []; - } - - if (isset($obj['entry_data']['TagPage'][0]['graphql'])) { - return $this->flattenMediaArray($obj['entry_data']['TagPage'][0]['graphql']['hashtag']['edge_hashtag_to_media']['edges'], self::STRUCTURE_VERSION_1); - } - - return $this->flattenMediaArray($obj['entry_data']['TagPage'][0]['data']['recent']['sections'], self::STRUCTURE_VERSION_2); - } - - /** - * Returns the best picture in size from the Instagram result array. - * - * @param array $pictures The array of pictures to choose the best version from. 
- * @param int $version The structure's version - * - * @return string - */ - private function getBestPicture(array $pictures, int $version): string - { - $url = ''; - $maxPixels = 0; - - foreach ($pictures as $picture) { - if ($version === self::STRUCTURE_VERSION_1) { - $pixels = $picture['config_width'] * $picture['config_height']; - if ($pixels > $maxPixels) { - $url = $picture['src']; - - $maxPixels = $pixels; - } - } else if ($version === self::STRUCTURE_VERSION_2) { - $pixels = $picture['width'] * $picture['height']; - if ($pixels > $maxPixels) { - $url = $picture['url']; - - $maxPixels = $pixels; - } + } else { + $obj = $this->parseInstagramResponse($html); + if (empty($obj)) { + return []; } } - return $url; + return $this->extractMedia($obj); } /** @@ -247,6 +180,7 @@ private function getBestPicture(array $pictures, int $version): string * * @return string|null * @throws \GuzzleHttp\Exception\GuzzleException + * @throws \craft\errors\SiteNotFoundException */ private function fetchInstagramPage(string $path): ?string { @@ -288,8 +222,12 @@ private function fetchInstagramPage(string $path): ?string * * @return mixed */ - private function parseProxyResponse(string $response) + private function parseProxyResponse(string $response): mixed { + if (InstagramFeed::$settings->dump) { + $this->dumpResponse($response); + } + - return json_decode($response, true); + return is_array($decoded = json_decode($response, true)) ? $decoded : false; } @@ -299,94 +237,85 @@ private function parseProxyResponse(string $response) * @param string $response Response body from Instagram * * @return array - * @throws \yii\base\ErrorException - * @throws \yii\base\Exception */ private function parseInstagramResponse(string $response): array { - if (InstagramFeed::$settings->dump) { - $timestamp = time(); - $path = Craft::$app->path->getStoragePath() . '/runtime/instagramfeed'; - FileHelper::writeToFile($path . '/' . $timestamp, $response); - Craft::info('Wrote Instagram response to ' . $path . '/' . 
$timestamp); - } - $arr = explode('window._sharedData = ', $response); if (!isset($arr[1])) { // Check if Instagram returned a statement and not a valid page - $response = json_decode($response, false); - if (isset($response->errors)) { - Craft::error('Instagram responsed with an error: ' . implode(' ', $response->errors->error), 'instagramfeed'); + $statement = json_decode($response, false); + if (isset($statement->errors)) { + Craft::error('Instagram responded with an error: ' . implode(' ', $statement->errors->error), 'instagramfeed'); } else { Craft::error('Unknown response from Instagram. Please check debug output in devMode.', 'instagramfeed'); } + $this->dumpResponse($response); return []; } + if (InstagramFeed::$settings->dump) { + $this->dumpResponse($response); + } + $arr = explode(';', $arr[1]); - return json_decode($arr[0], true); + return is_array($decoded = json_decode($arr[0], true)) ? $decoded : []; } /** - * Function to flatten the Instagram response to simple array + * Extracts the posts from the Instagram response * - * @param array $mediaArray The Instagram response array - * @param int $version The structure's version + * @param array $response The response from Instagram * * @return array */ - private function flattenMediaArray(array $mediaArray, int $version): array + private function extractMedia(array $response): array { $items = []; - if ($version === self::STRUCTURE_VERSION_1) { - foreach ($mediaArray as $media) { - $item['thumbnailSource'] = $this->getBestPicture($media['node']['thumbnail_resources'], $version); - $item['imageSource'] = $media['node']['display_url']; - $item['likes'] = $media['node']['edge_liked_by']['count'] ?? 0; - $item['comments'] = $media['node']['edge_media_to_comment']['count'] ?? 0; - $item['shortcode'] = $media['node']['shortcode']; - $item['timestamp'] = $media['node']['taken_at_timestamp']; - $item['caption'] = $media['node']['edge_media_to_caption']['edges'][0]['node']['text'] ?? 
''; - $item['isVideo'] = (bool)$media['node']['is_video']; - if ($item['isVideo']) { - $item['hasAudio'] = isset($media['node']['has_audio']) && $media['node']['has_audio']; - } - $item['video_view_count'] = $media['node']['video_view_count'] ?? 0; - $items[] = $item; - } - } else if ($version === self::STRUCTURE_VERSION_2) { - foreach ($mediaArray as $section) { - foreach ($section['layout_content']['medias'] as $node) { - if ((int)$node['media']['media_type'] === 8) { - if (!isset($node['media']['carousel_media'][0]['image_versions2'])) { - continue; - } - $item['thumbnailSource'] = $this->getBestPicture($node['media']['carousel_media'][0]['image_versions2']['candidates'], $version); - } else { - $item['thumbnailSource'] = $this->getBestPicture($node['media']['image_versions2']['candidates'], $version); - } - $item['imageSource'] = $item['thumbnailSource']; - $item['likes'] = $node['media']['like_count'] ?? 0; - $item['comments'] = $node['media']['comment_count'] ?? 0; - $item['shortcode'] = $node['media']['code']; - $item['timestamp'] = $node['media']['taken_at']; - $item['caption'] = $node['media']['caption']['text'] ?? ''; - $item['isVideo'] = (int)$node['media']['media_type'] === 2; - if ($item['isVideo']) { - $item['hasAudio'] = isset($node['media']['has_audio']) && $node['media']['has_audio']; - } - $item['video_view_count'] = $node['media']['video_view_count'] ?? 0; - $items[] = $item; - } + $structures = include(__DIR__ . '/../structures.php'); + + foreach ($structures as $config) { + if ($this->isStructure($response, $config['structure'])) { + $parser = new $config['parser']; + return $parser->getItems($response); } } + // No known structure found, if $response is not empty, we will dump it + if (!empty($response)) { + $this->dumpResponse(serialize($response)); + } + return $items; } + /** + * Checks if an array matches to a specific structure + * + * @param array $haystack The haystack to check for. 
+ * @param string|array $structure The structure to check against. + * + * @return bool + */ + private function isStructure(array $haystack, string|array $structure): bool + { + if (is_string($structure)) { + $structure = explode('.', $structure); + } + + if (empty($structure)) { + return true; + } + $node = array_shift($structure); + if (!isset($haystack[$node]) || !is_array($haystack[$node])) { + return false; + } + + return $this->isStructure($haystack[$node], $structure); + } + /** * Download and store images * @@ -532,6 +461,26 @@ private function populateImages(array $items): array return $items; } + + /** + * Write the response to a file in the storage folder. + * + * @param string $response The raw response body to write. + * + * @return void + */ + private function dumpResponse(string $response): void + { + try { + $timestamp = time(); + $path = Craft::$app->path->getStoragePath() . '/runtime/instagramfeed'; + FileHelper::writeToFile($path . '/' . $timestamp, $response); + Craft::info('Wrote Instagram response to ' . $path . '/' . $timestamp); + } catch (\Exception $e) { + Craft::error('Cannot write Instagram response to ' . ($path ?? '[unknown storage path]') . '/' . $timestamp . ': ' . $e->getMessage()); + } + } + /** * Whether the plugin can use the proxy. * diff --git a/src/structures.php b/src/structures.php new file mode 100644 index 0000000..31be326 --- /dev/null +++ b/src/structures.php @@ -0,0 +1,20 @@ + 'data.user.edge_owner_to_timeline_media.edges', + 'parser' => codemonauts\instagramfeed\parsers\AccountVersion1Parser::class, + ], + [ + 'structure' => 'entry_data.TagPage.0.graphql.hashtag.edge_hashtag_to_media.edges', + 'parser' => codemonauts\instagramfeed\parsers\TagVersion1Parser::class, + ], + [ + 'structure' => 'data.recent.sections', + 'parser' => codemonauts\instagramfeed\parsers\TagVersion2Parser::class, + ], + [ + 'structure' => 'data.hashtag.edge_hashtag_to_media.edges', + 'parser' => codemonauts\instagramfeed\parsers\TagVersion3Parser::class, + ], +]; \ No newline at end of file