diff --git a/docs/_build/doctrees/advertools.crawlytics.doctree b/docs/_build/doctrees/advertools.crawlytics.doctree
index 425bf907..54d8bd7c 100644
Binary files a/docs/_build/doctrees/advertools.crawlytics.doctree and b/docs/_build/doctrees/advertools.crawlytics.doctree differ
diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle
index 6011344f..56919d31 100644
Binary files a/docs/_build/doctrees/environment.pickle and b/docs/_build/doctrees/environment.pickle differ
diff --git a/docs/_build/html/_modules/advertools/crawlytics.html b/docs/_build/html/_modules/advertools/crawlytics.html
index 507dab2a..d9b0fa1a 100644
--- a/docs/_build/html/_modules/advertools/crawlytics.html
+++ b/docs/_build/html/_modules/advertools/crawlytics.html
@@ -154,7 +154,7 @@ Source code for advertools.crawlytics
 
 >>> import advertools as adv
 >>> import pandas as pd
->>> crawldf = pd.read_json('path/to/output_file.jl', lines=True)
+>>> crawldf = pd.read_json("path/to/output_file.jl", lines=True)
 >>> img_df = adv.crawlytics.images(crawldf)
 >>> img_df
 
@@ -211,7 +211,7 @@ Source code for advertools.crawlytics

 The ``crawlytics.links`` function gives you a summary of the links that is similar to
 the format of the ``crawlytics.images`` DataFrame.
 
->>> link_df = adv.crawlytics.links(crawldf, internal_url_regex='nytimes.com')
+>>> link_df = adv.crawlytics.links(crawldf, internal_url_regex="nytimes.com")
 >>> link_df
 
 ====  ===========================================================  ========================================================================  ==================  ==========  ==========
@@ -301,9 +301,10 @@ Source code for advertools.crawlytics

 columns of interest, write them to a new file, and delete the old large crawl file.
 
 >>> crawl_subset = adv.crawlytics.jl_subset(
-...    filepath='/path/to/output_file.jl',
-...    columns=[col1, col2, ...],
-...    regex=column_regex)
+...     filepath="/path/to/output_file.jl",
+...     columns=[col1, col2, ...],
+...     regex=column_regex,
+... )
 
 You can use the ``columns`` parameter to specify exactly which columns you want. You can
 also use a regular expression to specify a set of columns. Here are some examples of
@@ -346,7 +347,7 @@ Source code for advertools.crawlytics

 One of the main advantages of using parquet is that you can select which columns you want
 to read.
 
->>> adv.crawlytics.parquet_columns('output_file.parquet') # first 15 columns only
+>>> adv.crawlytics.parquet_columns("output_file.parquet")  # first 15 columns only
 
 ====  ==============  ======
   ..  column          type
@@ -370,7 +371,7 @@ Source code for advertools.crawlytics

 
 Check how many columns we have of each type.
 
->>> adv.crawlytics.parquet_columns('nyt_crawl.parquet')['type'].value_counts()
+>>> adv.crawlytics.parquet_columns("nyt_crawl.parquet")["type"].value_counts()
 
 ====  =========================================================================================================================================================  =======
   ..  type                                                                                                                                                         count
@@ -410,6 +411,7 @@ Source code for advertools.crawlytics

     "jl_to_parquet",
     "parquet_columns",
     "compare",
+    "running_crawls",
 ]
 
 
@@ -433,7 +435,7 @@ Source code for advertools.crawlytics

     --------
     >>> import advertools as adv
     >>> import pandas as pd
-    >>> crawldf = pd.read_json('output_file.jl', lines=True)
+    >>> crawldf = pd.read_json("output_file.jl", lines=True)
     >>> redirect_df = adv.crawlytics.redirects(crawldf)
     >>> redirect_df
 
@@ -516,7 +518,7 @@ Source code for advertools.crawlytics

     --------
     >>> import advertools as adv
     >>> import pandas as pd
-    >>> crawldf = pd.read_json('output_file.jl', lines=True)
+    >>> crawldf = pd.read_json("output_file.jl", lines=True)
     >>> link_df = adv.crawlytics.links(crawldf)
     >>> link_df
 
@@ -581,7 +583,7 @@ Source code for advertools.crawlytics

     --------
     >>> import advertools as adv
     >>> import pandas as pd
-    >>> crawldf = pd.read_json('output_file.jl', lines=True)
+    >>> crawldf = pd.read_json("output_file.jl", lines=True)
     >>> image_df = adv.crawlytics.images(crawldf)
     >>> image_df
 
@@ -643,15 +645,17 @@ Source code for advertools.crawlytics

 
     Read only the columns "url" and "meta_desc":
 
-    >>> adv.crawlytics.jl_subset('output_file.jl', columns=['url', 'meta_desc'])
+    >>> adv.crawlytics.jl_subset("output_file.jl", columns=["url", "meta_desc"])
 
     Read columns matching the regex "jsonld":
 
-    >>> adv.crawlytics.jl_subset('output_file.jl', regex='jsonld')
+    >>> adv.crawlytics.jl_subset("output_file.jl", regex="jsonld")
 
     Read the columns "url" and "meta_desc" as well as columns matching "jsonld":
 
-    >>> adv.crawlytics.jl_subset('output_file.jl', columns=['url', 'meta_desc'], regex='jsonld')
+    >>> adv.crawlytics.jl_subset(
+    ...     "output_file.jl", columns=["url", "meta_desc"], regex="jsonld"
+    ... )
 
     Returns
     -------
@@ -766,9 +770,9 @@ Source code for advertools.crawlytics

 
     >>> import advertools as adv
     >>> import pandas as pd
-    >>> df1 = pd.read_json('output_file1.jl', lines=True)
-    >>> df2 = pd.read_json('output_file2.jl', lines=True)
-    >>> adv.crawlytics.compare(df1, df1, 'size')
+    >>> df1 = pd.read_json("output_file1.jl", lines=True)
+    >>> df2 = pd.read_json("output_file2.jl", lines=True)
+    >>> adv.crawlytics.compare(df1, df1, "size")
 
     ====  ==========================  ========  ========  ======  ===========
       ..  url                           size_x    size_y    diff    diff_perc
@@ -806,6 +810,8 @@ Source code for advertools.crawlytics

 
 
 
+
+[docs]
 def running_crawls():
     """Get details of currently running spiders.
@@ -816,11 +822,47 @@ Source code for advertools.crawlytics

     * elapsed: The elapsed time since the spider started.
     * %mem: The percentage of memory that this spider is consuming.
     * %cpu: The percentage of CPU that this spider is consuming.
-    * args: The full command that was used to start this spider. Use this to identify
+    * command: The command that was used to start this spider. Use this to identify
       the spider(s) that you want to know about.
     * output_file: The path to the output file for each running crawl job.
     * crawled_urls: The current number of lines in ``output_file``.
+
+    Examples
+    --------
+    While a crawl is running:
+
+    >>> import advertools as adv
+    >>> adv.crawlytics.running_crawls()
+
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+      ..     pid  started    elapsed      %mem    %cpu  command                                                                                                                                                                                                                                                                                                                                                                                                    output_file      crawled_urls
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+       0  195720  21:41:14   00:11         1.1     103  /opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://cnn.com -a allowed_domains=cnn.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o cnn.jl -s CLOSESPIDER_PAGECOUNT=200  cnn.jl                     30
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+
+    After a few moments:
+
+    >>> adv.crawlytics.running_crawls()
+
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+      ..     pid  started    elapsed      %mem    %cpu  command                                                                                                                                                                                                                                                                                                                                                                                                    output_file      crawled_urls
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+       0  195720  21:41:14   00:27         1.2    96.7  /opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://cnn.com -a allowed_domains=cnn.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o cnn.jl -s CLOSESPIDER_PAGECOUNT=200  cnn.jl                     72
+    ====  ======  =========  =========  ======  ======  =========================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+
+    After starting a new crawl:
+
+    >>> adv.crawlytics.running_crawls()
+
+    ====  ======  =========  =========  ======  ======  =================================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+      ..     pid  started    elapsed      %mem    %cpu  command                                                                                                                                                                                                                                                                                                                                                                                                            output_file      crawled_urls
+    ====  ======  =========  =========  ======  ======  =================================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
+       0  195720  21:41:14   01:02         1.6    95.7  /opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://cnn.com -a allowed_domains=cnn.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o cnn.jl -s CLOSESPIDER_PAGECOUNT=200          cnn.jl                    154
+       1  195769  21:42:09   00:07         0.4    83.8  /opt/tljh/user/bin/python /opt/tljh/user/bin/scrapy runspider /opt/tljh/user/lib/python3.10/site-packages/advertools/spider.py -a url_list=https://nytimes.com -a allowed_domains=nytimes.com -a follow_links=True -a exclude_url_params=None -a include_url_params=None -a exclude_url_regex=None -a include_url_regex=None -a css_selectors=None -a xpath_selectors=None -o nyt.jl -s CLOSESPIDER_PAGECOUNT=200  nyt.jl                     17
+    ====  ======  =========  =========  ======  ======  =================================================================================================================================================================================================================================================================================================================================================================================================================  =============  ==============
     """
+    if platform.system() == "Windows":
+        return "This is function does not support Windows yet. Will be, soon. Sorry!"
     ps = run(["ps", "xo", "pid,start,etime,%mem,%cpu,args"])
     ps_stdout = ps.stdout.splitlines()
     df = pd.DataFrame(
@@ -842,7 +884,8 @@ Source code for advertools.crawlytics

         crawl_urls = crawl_urls[: min(len(crawl_urls), len(df_subset))]
         df_subset["crawled_urls"] = crawl_urls
     df_subset.columns = df_subset.columns.str.lower()
-    return df_subset.rename(columns={"args": "command"})
+    return df_subset.rename(columns={"args": "command"})
+
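
For context, here is a minimal sketch of how the DataFrame returned by running_crawls() might be
used programmatically. It assumes only the columns documented in the docstring above (output_file,
crawled_urls, elapsed, %cpu); the watch_crawls helper and its parameters are hypothetical and not
part of advertools.

    import time

    import advertools as adv

    def watch_crawls(interval=10, rounds=5):
        """Hypothetical helper: print the progress of running crawls a few times."""
        for _ in range(rounds):
            running = adv.crawlytics.running_crawls()
            # On Windows running_crawls returns a message string; otherwise a DataFrame.
            if not hasattr(running, "empty") or running.empty:
                print("No running crawls (or unsupported platform).")
                return
            for _, row in running.iterrows():
                print(
                    f"{row['output_file']}: {row['crawled_urls']} URLs crawled "
                    f"(elapsed {row['elapsed']}, %cpu {row['%cpu']})"
                )
            time.sleep(interval)
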
diff --git a/docs/_build/html/advertools.crawlytics.html b/docs/_build/html/advertools.crawlytics.html
index 02d00e67..d0da7a71 100644
--- a/docs/_build/html/advertools.crawlytics.html
+++ b/docs/_build/html/advertools.crawlytics.html
@@ -156,7 +156,7 @@ Analyzing crawled images
>>> import advertools as adv
 >>> import pandas as pd
->>> crawldf = pd.read_json('path/to/output_file.jl', lines=True)
+>>> crawldf = pd.read_json("path/to/output_file.jl", lines=True)
 >>> img_df = adv.crawlytics.images(crawldf)
 >>> img_df
 
@@ -333,7 +333,7 @@ Analyzing links in a crawled website
 The crawlytics.links function gives you a summary of the links that is similar to
 the format of the crawlytics.images DataFrame.
 
->>> link_df = adv.crawlytics.links(crawldf, internal_url_regex='nytimes.com')
+>>> link_df = adv.crawlytics.links(crawldf, internal_url_regex="nytimes.com")
 You can use the columns parameter to specify exactly which columns you want. You can
@@ -651,7 +652,7 @@ Exploring the columns and data types of parquet files

 Another simple function gives us a DataFrame of the available columns in a parquet file.
 One of the main advantages of using parquet is that you can select which columns you want to read.

->>> adv.crawlytics.parquet_columns('output_file.parquet') # first 15 columns only
+>>> adv.crawlytics.parquet_columns("output_file.parquet")  # first 15 columns only
 
@@ -725,7 +726,7 @@ Exploring the columns and data types of parquet files
 
 Check how many columns we have of each type.

->>> adv.crawlytics.parquet_columns('nyt_crawl.parquet')['type'].value_counts()
+>>> adv.crawlytics.parquet_columns("nyt_crawl.parquet")["type"].value_counts()
 
@@ -814,9 +815,9 @@ Module functions
 
 Examples

>>> import advertools as adv
 >>> import pandas as pd
->>> df1 = pd.read_json('output_file1.jl', lines=True)
->>> df2 = pd.read_json('output_file2.jl', lines=True)
->>> adv.crawlytics.compare(df1, df1, 'size')
+>>> df1 = pd.read_json("output_file1.jl", lines=True)
+>>> df2 = pd.read_json("output_file2.jl", lines=True)
+>>> adv.crawlytics.compare(df1, df1, "size")
 

@@ -888,7 +889,7 @@ Module functions
 
 Examples

>>> import advertools as adv
 >>> import pandas as pd
->>> crawldf = pd.read_json('output_file.jl', lines=True)
+>>> crawldf = pd.read_json("output_file.jl", lines=True)
 >>> image_df = adv.crawlytics.images(crawldf)
 >>> image_df
 
@@ -1030,15 +1031,17 @@ Module functions
 
->>> adv.crawlytics.jl_subset('output_file.jl', columns=['url', 'meta_desc'])
+>>> adv.crawlytics.jl_subset("output_file.jl", columns=["url", "meta_desc"])
 

Read columns matching the regex "jsonld":

->>> adv.crawlytics.jl_subset('output_file.jl', regex='jsonld')
+>>> adv.crawlytics.jl_subset("output_file.jl", regex="jsonld")
 

Read the columns "url" and "meta_desc" as well as columns matching "jsonld":

->>> adv.crawlytics.jl_subset('output_file.jl', columns=['url', 'meta_desc'], regex='jsonld')
+>>> adv.crawlytics.jl_subset(
+...     "output_file.jl", columns=["url", "meta_desc"], regex="jsonld"
+... )
 
@@ -1092,7 +1095,7 @@ Module functions
 
 Examples

[Rendered-HTML table residue: the remaining hunks add the HTML for the three running_crawls()
example tables already shown in the docstring diff above (columns pid, started, elapsed, %mem,
%cpu, command, output_file, crawled_urls), showing the cnn.com crawl at 30, 72, and then 154
crawled URLs and a second nytimes.com crawl (pid 195769) at 17 crawled URLs, along with the
captions "After a few moments:" and "After starting a new crawl:" and the repeated
">>> adv.crawlytics.running_crawls()" calls.]
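
The jl_subset and parquet_columns hunks earlier in this diff describe a workflow for trimming and
re-reading large crawl files. As a rough end-to-end sketch: the file names below are placeholders,
and jl_to_parquet (listed in the module's __all__ above) is assumed to take an input .jl path and
an output .parquet path, a signature this diff does not show.

    import advertools as adv
    import pandas as pd

    # 1. Pull only the columns we need out of a large .jl crawl file.
    subset = adv.crawlytics.jl_subset(
        "output_file.jl",
        columns=["url", "title", "meta_desc"],
        regex="jsonld",  # also keep any JSON-LD columns
    )

    # 2. Convert the full crawl file to parquet for cheaper repeated reads
    #    (assumed signature: input .jl path, output .parquet path).
    adv.crawlytics.jl_to_parquet("output_file.jl", "output_file.parquet")

    # 3. List the available columns and their types, then read only a few of them.
    cols = adv.crawlytics.parquet_columns("output_file.parquet")
    urls_and_titles = pd.read_parquet("output_file.parquet", columns=["url", "title"])
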