diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..4c1e37b --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,16 @@ +version: 2 +updates: + # GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" + commit-message: + prefix: ⬆ + # Python + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "monthly" + commit-message: + prefix: ⬆ \ No newline at end of file diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..2afd810 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,32 @@ +name: Lint + +on: + push: + branches: + - master + pull_request: + types: [opened, synchronize] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Github actions init + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + + - name: Update Pip + run: pip install --upgrade pip + + - name: Install Dependencies + run: pip install ruff mypy + + - name: Install + run: pip install -e . 
+ + - name: Lint + run: bash scripts/lint.sh \ No newline at end of file diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..58c6ae4 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,33 @@ +name: Publish + +on: + workflow_dispatch: + release: + types: + - created + +jobs: + publish: + runs-on: ubuntu-latest + permissions: + id-token: write + steps: + - name: Github actions init + uses: actions/checkout@v4 + with: + # To force fetching tags + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + + - name: Install build dependencies + run: pip install build + + - name: Build distribution + run: python -m build + + - name: Publish + uses: pypa/gh-action-pypi-publish@v1.12.2 \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..9382304 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,45 @@ +name: Release + +on: + workflow_dispatch: + +permissions: + contents: write + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Github actions init + uses: actions/checkout@v4 + with: + # To force fetching tags + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.9" + + - name: Install build dependencies + run: pip install build + + - name: Build + run: python -m build + + - name: Read VERSION file + id: getversion + run: echo "version=$(cat src/extract_favicon/VERSION.md)" >> $GITHUB_OUTPUT + + - name: Changelog + run: git log $(git describe --tags --abbrev=0)..HEAD --format="%s %h" > LATEST-CHANGES.md + + - name: Release + uses: softprops/action-gh-release@v2 + with: + files: | + dist/extract_favicon-${{ steps.getversion.outputs.version }}-py3-none-any.whl + dist/extract_favicon-${{ steps.getversion.outputs.version }}.tar.gz + tag_name: v${{ steps.getversion.outputs.version }} + body_path: 
LATEST-CHANGES.md + token: ${{ secrets.PAT_EXTRACT_FAVICON }} \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9f6c940 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.venv +__pycache__ +*.pyc +.ruff_cache +.mypy_cache +.pytest_cache diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a8fc851 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2016 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..a583d4d --- /dev/null +++ b/README.md @@ -0,0 +1,24 @@ +# Extract Favicon + +`extract-favicon` is a Python library to find and extract the favicon of any website. 
+ +## Installation + +```bash +pip install extract-favicon +``` + +## Usage + +```console +>>> import extract_favicon +>>> icons = extract_favicon.from_html(my_html, root_url="https://www.python.org/static/") +Favicon(url='https://www.python.org/static/apple-touch-icon-144x144-precomposed.png', format='png', width=144, height=144) +Favicon(url='https://www.python.org/static/apple-touch-icon-114x114-precomposed.png', format='png', width=114, height=114) +Favicon(url='https://www.python.org/static/apple-touch-icon-72x72-precomposed.png', format='png', width=72, height=72) +Favicon(url='https://www.python.org/static/apple-touch-icon-precomposed.png', format='png', width=0, height=0) +Favicon(url='https://www.python.org/static/favicon.ico', format='ico', width=0, height=0) +``` + +## Inspiration +This library is an extension of the [favicon](https://github.com/scottwernervt/favicon/) package. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ee2c650 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,45 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "extract-favicon" +description = "Find and extract the favicon of any website" +dynamic = ["version"] +readme = "README.md" +authors = [{ name = "Alex Mili" }] +license = { file = "LICENSE" } +requires-python = ">=3.9" +classifiers = [ + "License :: OSI Approved :: MIT License", + "Intended Audience :: Developers", +] +keywords = [] +dependencies = [ + "bs4", + "pillow" +] + +[project.urls] +Homepage = "https://github.com/AlexMili/Extract_Favicon" +Issues = "https://github.com/AlexMili/Extract_Favicon/issues" +Repository = "https://github.com/AlexMili/Extract_Favicon" +Documentation = "https://github.com/AlexMili/Extract_Favicon" + + +[tool.hatch.build.targets.wheel] +packages = ["./src/extract_favicon/"] + +[tool.hatch.version] +path = "src/extract_favicon/VERSION.md" +pattern = "(?P<version>.*)" + +[tool.ruff.lint.isort] +lines-after-imports = 2 +known-first-party = 
"""Find the favicons declared in an HTML document."""

import base64
import io
import os
import re
from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple
from urllib.parse import ParseResult, urljoin, urlparse


if TYPE_CHECKING:
    # Needed for annotations only. bs4 and Pillow are imported inside the
    # functions that use them, so the pure helpers of this module stay
    # importable without the parsing / imaging stack.
    from bs4.element import Tag


# Lower-cased ``rel`` values of the <link> tags treated as icon declarations.
LINK_TAGS: list[str] = [
    "icon",
    "shortcut icon",
    "apple-touch-icon",
    "apple-touch-icon-precomposed",
]


# "<width>x<height>" hint embedded in a file name, e.g. "logo-128x128.png".
SIZE_RE: re.Pattern[str] = re.compile(
    r"(?P<width>\d{2,4})x(?P<height>\d{2,4})", flags=re.IGNORECASE
)

# Separator between width and height in a ``sizes`` token: "x", "X" or "\xd7".
_SIZE_SEPARATOR_RE: re.Pattern[str] = re.compile(r"[x\xd7]", flags=re.IGNORECASE)

# ``name`` of the <meta> tag whose ``content`` is also treated as an icon URL.
_META_NAME: str = "msapplication-TileImage"


class Favicon(NamedTuple):
    """One favicon candidate found in a page."""

    # Icon location: a (possibly relative) URL, or the whole ``data:`` URI
    # for inline icons.
    url: str
    # Lower-cased file extension or image subtype ("png", "ico", "svg", ...);
    # empty when it cannot be determined.
    format: str
    # Dimensions in pixels; 0 means unknown.
    width: int = 0
    height: int = 0


def _has_content(text: Optional[str]) -> bool:
    """Return True when *text* is neither ``None`` nor empty."""
    return bool(text)


# From https://github.com/scottwernervt/favicon/
def _is_absolute(url: str) -> bool:
    """Check if a URL is absolute.

    :param url: URL to check.
    :type url: str

    :return: True if the URL has a network location (``//host/...`` or
        ``scheme://host/...``), False for path-only URLs.
    :rtype: bool
    """
    return _has_content(urlparse(url).netloc)


def _parse_size_token(token: str) -> Optional[Tuple[int, int]]:
    """Parse one ``sizes`` token such as ``"32x32"`` into ``(width, height)``.

    Returns ``None`` when the token is not made of exactly two numeric parts.
    """
    parts = _SIZE_SEPARATOR_RE.split(token)
    if len(parts) != 2:
        return None

    # Repair bad html attribute values: sizes="192x192+"
    width, height = ("".join(c for c in part if c.isdecimal()) for part in parts)
    if not width or not height:
        return None
    return int(width), int(height)


def _favicon_from_url(parsed: ParseResult, width: int = 0, height: int = 0) -> Favicon:
    """Build a :class:`Favicon` whose format is the extension of the URL path."""
    _, ext = os.path.splitext(parsed.path)
    return Favicon(parsed.geturl(), ext[1:].lower(), width, height)


def _favicon_from_data_uri(href: str) -> Favicon:
    """Build a :class:`Favicon` from an inline ``data:`` URI.

    The format is the subtype of the declared media type (``svg+xml`` is
    reported as ``svg``). The size is read from the decoded image when Pillow
    can open it, and left at ``(0, 0)`` otherwise.
    """
    from PIL import Image

    # partition() tolerates a URI without any "," (payload is then empty).
    header, _, payload = href.partition(",")

    # "data:image/png;base64" -> "image/png" -> "png"
    media_type = header[len("data:") :].split(";", 1)[0].strip().lower()
    suffix = media_type.rpartition("/")[2]
    if suffix == "svg+xml":
        suffix = "svg"

    width, height = 0, 0
    try:
        with Image.open(io.BytesIO(base64.b64decode(payload))) as img:
            width, height = img.size
    except (ValueError, OSError):
        # The payload is not base64 or not an image Pillow can read (SVG for
        # instance): keep the icon and report its size as unknown.
        pass

    return Favicon(href, suffix, width, height)


def from_html(
    html: str, root_url: Optional[str] = None, include_default_favicon: bool = False
) -> set[Favicon]:
    """Extract all favicons in a given HTML.

    Icons are read from ``<link>`` tags whose ``rel`` is one of
    :data:`LINK_TAGS` and from ``<meta name="msapplication-TileImage">`` tags.

    Args:
        html: HTML to parse.
        root_url: Root URL where the favicon is located. When omitted, the
            ``href`` of the page's ``<base>`` tag is used if there is one.
        include_default_favicon: Include /favicon.ico in the list when no other
            favicons have been found

    Returns:
        A set of favicons.
    """
    from bs4 import BeautifulSoup

    page = BeautifulSoup(html, features="html.parser")

    # Handle the <base> tag if it exists.
    # The caller's value for root_url takes priority over the <base> tag.
    if root_url is None:
        base_tag = page.find("base", href=True)
        if base_tag is not None:
            root_url = str(base_tag["href"])

    tags = set()
    tags.update(
        page.find_all(
            "link",
            attrs={
                "rel": lambda r: _has_content(r) and r.lower() in LINK_TAGS,
                "href": True,
            },
        )
    )
    tags.update(
        page.find_all(
            "meta",
            attrs={
                "name": lambda n: _has_content(n) and n.lower() == _META_NAME.lower(),
                "content": True,
            },
        )
    )

    favicons: set[Favicon] = set()
    for tag in tags:
        href = (tag.get("href") or tag.get("content") or "").strip()

        # Skip tags whose href/content is empty.
        if not href:
            continue

        if href[:5].lower() == "data:":
            # This is an inline (usually base64) image.
            favicons.add(_favicon_from_data_uri(href))
            continue

        if root_url is not None:
            absolute = href if _is_absolute(href) else urljoin(root_url, href)
            # Repair '//cdn.network.com/favicon.png' or `icon.png?v2`
            parsed = urlparse(absolute, scheme=urlparse(root_url).scheme)
        else:
            parsed = urlparse(href)

        width, height = get_dimension(tag)
        favicons.add(_favicon_from_url(parsed, width, height))

    if include_default_favicon and not favicons:
        default_href = "/favicon.ico"
        # urljoin() returns a str, so it is parsed before being handed over.
        if root_url is not None:
            default_url = urlparse(urljoin(root_url, default_href))
        else:
            default_url = urlparse(default_href)
        favicons.add(_favicon_from_url(default_url))

    return favicons


def get_dimension(tag: "Tag") -> Tuple[int, int]:
    """Get icon dimensions from size attribute or icon filename.

    When ``sizes`` lists several dimensions, the largest one (by area) is
    returned. When ``sizes`` is missing, ``any`` or cannot be parsed, the
    dimensions are looked up in the ``href``/``content`` file name instead.

    :param tag: Link or meta tag.
    :type tag: :class:`bs4.element.Tag`

    :return: If found, width and height, else (0,0).
    :rtype: tuple(int, int)
    """
    raw_sizes = tag.get("sizes") or ""
    # Attribute values are normally strings; a list of tokens is joined back.
    sizes = raw_sizes if isinstance(raw_sizes, str) else " ".join(raw_sizes)

    if sizes and sizes.strip().lower() != "any":
        # "16x16 32x32 64x64": compare numerically, since a plain string sort
        # would rank "64x64" above "128x128".
        candidates = [
            parsed
            for parsed in map(_parse_size_token, sizes.split())
            if parsed is not None
        ]
        if candidates:
            return max(candidates, key=lambda wh: wh[0] * wh[1])

    filename = str(tag.get("href") or tag.get("content") or "")
    match = SIZE_RE.search(filename)
    if match is None:
        return 0, 0
    return int(match.group("width")), int(match.group("height"))
+ [ + ('', "https://example.com/logo.png"), + ('', "https://example.com/logo.png"), + ( + '', + "https://example.com/static/logo.png", + ), + ( + '', + "https://example.com/logo.png", + ), + ( + '', + "https://example.com/logo.png", + ), + ( + '', + "https://example.com/logo.png?v2", + ), + ], + ids=[ + "filename", + "filename \\t", + "relative", + "https", + "forward slashes", + "query string", + ], +) +def test_link_tag_href_attribute(tag, url): + favicons = extract_favicon.from_html(HTML.replace("%content%", tag), root_url=url) + assert len(favicons) == 1 + assert favicons.pop().url == url + + +@pytest.mark.parametrize( + "tag", + [ + '', + ], + ids=[ + "Malformed icon size", + ], +) +def test_malformed_link(tag): + favicons = extract_favicon.from_html(HTML.replace("%content%", tag)) + assert len(favicons) == 1 + + +@pytest.mark.parametrize( + "tag", + [ + '', + '', + ], + ids=[ + "Href str length 0", + "No href", + ], +) +def test_link_tag_empty_href_attribute(tag): + favicons = extract_favicon.from_html(HTML.replace("%content%", tag)) + assert len(favicons) == 0 + + +@pytest.mark.parametrize( + "tag", + [ + '', + '', + ], + ids=["msapplication-TileImage", "msapplication-tileimage"], +) +def test_meta_tag(tag): + favicons = extract_favicon.from_html(HTML.replace("%content%", tag)) + assert len(favicons) == 1 + + +@pytest.mark.parametrize( + "tag", + [ + '', + '', + '', + ], + ids=["Missing meta", "Empty meta str length 0", "Empty meta content"], +) +def test_invalid_meta_tag(tag): + favicons = extract_favicon.from_html(HTML.replace("%content%", tag)) + assert len(favicons) == 0 + + +@pytest.mark.parametrize( + "tag", + [ + '', + ], + ids=["Base64 image"], +) +def test_base64(tag): + favicons = extract_favicon.from_html(HTML.replace("%content%", tag)) + assert len(favicons) == 1 + favicon = favicons.pop() + assert favicon.format == "png" + assert favicon.width == 1 + assert favicon.height == 1 + + +# Test to verify tag handling +@pytest.mark.parametrize( + 
"tag", + [ + '\n', + ], + ids=[ + "Base tag with relative icon", + ], +) +def test_base_tag_link(tag): + favicons = extract_favicon.from_html(HTML.replace("%content%", tag)) + assert len(favicons) == 1 + favicon = favicons.pop() + assert favicon.url == "http://example.com/favicon.jpg"