From a9abdb0653a399237d835e247d9b42917325e702 Mon Sep 17 00:00:00 2001
From: Graham Knop
Date: Fri, 17 May 2024 01:28:07 +0200
Subject: [PATCH] remove sitemap generation

Sitemaps are not working at the moment since the web server has been
moved to Kubernetes. The sitemap generation had no code in common with
the rest of this repo, and will be replaced by
https://github.com/metacpan/metacpan-sitemap
---
 bin/generate_sitemap.pl |  46 ----------------
 lib/MetaCPAN/Sitemap.pm | 119 ----------------------------------------
 2 files changed, 165 deletions(-)
 delete mode 100755 bin/generate_sitemap.pl
 delete mode 100644 lib/MetaCPAN/Sitemap.pm

diff --git a/bin/generate_sitemap.pl b/bin/generate_sitemap.pl
deleted file mode 100755
index 1603256447b..00000000000
--- a/bin/generate_sitemap.pl
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env perl
-
-# Generate the sitemap XML files for the robots.txt file.
-
-use strict;
-use warnings;
-
-use File::Basename ();
-use File::Spec     ();
-use Cwd            ();
-use Config::ZOMG   ();
-
-my $root_dir;
-
-BEGIN {
-    my $bin_dir = File::Basename::dirname(__FILE__);
-    $root_dir
-        = Cwd::abs_path( File::Spec->catdir( $bin_dir, File::Spec->updir ) );
-}
-use lib "$root_dir/lib";
-use MetaCPAN::Sitemap ();
-
-my $config = Config::ZOMG->open(
-    name => 'MetaCPAN::Web',
-    path => $root_dir,
-);
-
-my $out_dir = "$root_dir/root/static/sitemaps/";
-mkdir $out_dir;
-
-my $web_host = $config->{web_host};
-$web_host =~ s{/\z}{};
-my $sitemaps = $config->{sitemap};
-
-for my $file ( sort keys %$sitemaps ) {
-    my %sm_config = %{ $sitemaps->{$file} };
-    my $full_file = $out_dir . $file;
-    $sm_config{url_prefix} ||= do {
-        my $metacpan_url = $sm_config{metacpan_url};
-        s{/\z}{}, s{\A/}{} for $metacpan_url;
-        "$web_host/$metacpan_url/";
-    };
-    $sm_config{api} = $config->{api};
-    my $sitemap = MetaCPAN::Sitemap->new(%sm_config);
-    $sitemap->write($full_file);
-}
diff --git a/lib/MetaCPAN/Sitemap.pm b/lib/MetaCPAN/Sitemap.pm
deleted file mode 100644
index 7e6807b19db..00000000000
--- a/lib/MetaCPAN/Sitemap.pm
+++ /dev/null
@@ -1,119 +0,0 @@
-package MetaCPAN::Sitemap;
-use strict;
-use warnings;
-use IO::Socket::SSL qw( SSL_VERIFY_PEER );
-use IO::Async::Loop    ();
-use Net::Async::HTTP   ();
-use Cpanel::JSON::XS   ();
-use IO::Compress::Gzip ();
-use HTML::Entities qw( encode_entities_numeric );
-use Future ();
-
-use Moo;
-
-has api         => ( is => 'ro', required => 1 );
-has url_prefix  => ( is => 'ro', required => 1 );
-has object_type => ( is => 'ro', required => 1 );
-has field_name  => ( is => 'ro', required => 1 );
-has filter      => ( is => 'ro' );
-has size        => ( is => 'ro', default => 1000 );
-has loop => ( is => 'lazy', default => sub { IO::Async::Loop->new } );
-has ua => (
-    is      => 'lazy',
-    default => sub {
-        my $self = shift;
-        my $http = Net::Async::HTTP->new(
-            user_agent =>
-                'MetaCPAN-Web/1.0 (https://github.com/metacpan/metacpan-web)',
-            max_connections_per_host => 5,
-            SSL_verify_mode          => SSL_VERIFY_PEER,
-            timeout                  => 10,
-        );
-        $self->loop->add($http);
-        $http;
-    }
-);
-
-sub DEMOLISH {
-    $_[0]->ua->remove_from_parent;
-}
-
-# Mandatory arguments to this function are
-#  [] search object_type (author and release)
-#  [] result field_name (pauseid and distribution)
-#  [] name of output xml_file (path to the output XML file)
-# Optional arguments to this function are
-#  [] output cpan_directory (author and release)
-#  [] test_search (search count - if non-zero, limits search to that number of
-#     items for testing)
-#  [] filter - contains filter for a field that also needs to be included in
-#     the list of form fields.
-
-my $json = Cpanel::JSON::XS->new->utf8->canonical;
-
-sub _request {
-    my ( $self, $content, $cb ) = @_;
-    my $url          = $self->api . '/';
-    my $content_type = 'text/plain';
-    if ( ref $content ) {
-        $url .= $self->object_type . '/';
-        $content_type = 'application/json';
-        $content      = $json->encode($content);
-    }
-    $url .= '_search/scroll?scroll=1m&size=' . $self->size;
-    $self->ua->POST( $url, $content, content_type => $content_type, )
-        ->then( sub {
-        my $response = shift;
-        my $content  = $json->decode( $response->content );
-        return Future->done
-            if !@{ $content->{hits}{hits} };
-        $cb->( $content->{hits}{hits} );
-        return $self->_request( $content->{_scroll_id}, $cb );
-        } );
-}
-
-sub write {
-    my ( $self, $file ) = @_;
-
-    my $fh = IO::Compress::Gzip->new( $file . '.new' );
-    $fh->print(<<'END_XML_HEADER');
-<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-END_XML_HEADER
-
-    $self->_request(
-        {
-            fields => [ $self->field_name ],
-            query  => { match_all => {} },
-            ( $self->filter ? ( filter => $self->filter ) : () ),
-            sort => [ $self->field_name ],
-        },
-        sub {
-            my $hits = shift;
-            for my $hit (@$hits) {
-                my $link_field = $hit->{fields}{ $self->field_name };
-                $link_field = $link_field->[0] if ref $link_field;
-                my $url = $self->url_prefix . $link_field;
-                $fh->print( '  <url><loc>'
-                        . encode_entities_numeric($url)
-                        . "</loc></url>\n" );
-            }
-        }
-    )->get;
-    $fh->print("</urlset>\n");
-    $fh->close;
-    rename "$file.new", "$file";
-    return;
-}
-
-1;
-__END__
-
-=head1 DESCRIPTION
-
-Generate an XML file containing URLs used by the robots.txt Sitemap. We use
-this module to generate one each for authors, modules and releases.
-
-=cut