diff --git a/main/404.html b/main/404.html new file mode 100644 index 00000000..efac6378 --- /dev/null +++ b/main/404.html @@ -0,0 +1,2292 @@ + + + + + + + + + + + + + + + + + + EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ +

404 - Not found

+ +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/alternatives/index.html b/main/alternatives/index.html new file mode 100644 index 00000000..52f73251 --- /dev/null +++ b/main/alternatives/index.html @@ -0,0 +1,2338 @@ + + + + + + + + + + + + + + + + + + + + + + Alternatives & Comparison - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + Skip to content + + +
+
+ +
+ + + + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + + + + +

Alternatives & Comparison

+

EDS-PDF was developed to propose a more modular and extendable approach to PDF extraction than PDFBox, the legacy implementation at APHP's clinical data warehouse.

+

EDS-PDF takes inspiration from Explosion's spaCy pipelining system and closely follows its API. Therefore, the core object within EDS-PDF is the Pipeline, which organises the processing of PDF documents into multiple components. However, unlike spaCy, the library is built around a single deep learning framework, pytorch, which makes model development easier.

+

    + + + + + + +
    +
    + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/assets/_mkdocstrings.css b/main/assets/_mkdocstrings.css new file mode 100644 index 00000000..049a254b --- /dev/null +++ b/main/assets/_mkdocstrings.css @@ -0,0 +1,64 @@ + +/* Avoid breaking parameter names, etc. in table cells. */ +.doc-contents td code { + word-break: normal !important; +} + +/* No line break before first paragraph of descriptions. */ +.doc-md-description, +.doc-md-description>p:first-child { + display: inline; +} + +/* Max width for docstring sections tables. */ +.doc .md-typeset__table, +.doc .md-typeset__table table { + display: table !important; + width: 100%; +} + +.doc .md-typeset__table tr { + display: table-row; +} + +/* Defaults in Spacy table style. */ +.doc-param-default { + float: right; +} + +/* Keep headings consistent. */ +h1.doc-heading, +h2.doc-heading, +h3.doc-heading, +h4.doc-heading, +h5.doc-heading, +h6.doc-heading { + font-weight: 400; + line-height: 1.5; + color: inherit; + text-transform: none; +} + +h1.doc-heading { + font-size: 1.6rem; +} + +h2.doc-heading { + font-size: 1.2rem; +} + +h3.doc-heading { + font-size: 1.15rem; +} + +h4.doc-heading { + font-size: 1.10rem; +} + +h5.doc-heading { + font-size: 1.05rem; +} + +h6.doc-heading { + font-size: 1rem; +} \ No newline at end of file diff --git a/main/assets/images/favicon.png b/main/assets/images/favicon.png new file mode 100644 index 00000000..1cf13b9f Binary files /dev/null and b/main/assets/images/favicon.png differ diff --git a/main/assets/images/model-parallelism.png b/main/assets/images/model-parallelism.png new file mode 100644 index 00000000..45543956 Binary files /dev/null and b/main/assets/images/model-parallelism.png differ diff --git a/main/assets/images/multiprocessing.svg b/main/assets/images/multiprocessing.svg new file mode 100644 index 00000000..594b0d04 --- /dev/null +++ b/main/assets/images/multiprocessing.svg @@ -0,0 +1,3 @@ + + +
    CPU Worker 1
    CPU Worker 1
    CPU Worker 2
    CPU Worker 2
    CPU Worker 3
    CPU Worker 3
    GPU Worker 1
    GPU Worker 1
    GPU Worker 2
    GPU Worker 2

    batch_id = 8
    cpu_id = 1
    gpu_id = 0
    stage = 2
    forward out = ...

    batch_id = 8...

    batch_id = 28
    cpu_id = 2
    gpu_id = 1
    stage = 0
    collate out = ...

    batch_id = 28...
    Inputs
    Inputs
    Non deep-learning ops:
    - extractors
    - aggregators
    - feature preprocessing
    - feature collating
    - forward output postproc.
    Non deep-learning ops:...
    Deep-learning ops:
    - forward
    Deep-learning ops:...

    batch_id = 28
    cpu_id = 2
    gpu_id = ?
    stage = 0
    input doc = ...

    batch_id = 28...
    Outputs
    Outputs
    Text is not SVG - cannot display
    diff --git a/main/assets/images/transformer-windowing.svg b/main/assets/images/transformer-windowing.svg new file mode 100644 index 00000000..9b79e761 --- /dev/null +++ b/main/assets/images/transformer-windowing.svg @@ -0,0 +1,3 @@ + + +
    [CLS]
    [CLS]
    The
    The
    echo
    echo
    gra
    gra
    phy
    phy
    shows
    shows
    no
    no
    sign
    sign
    of
    of
    cancer
    cancer
    [CLS]
    [CLS]
    The
    The
    echo
    echo
    gra
    gra
    phy
    phy
    shows
    shows
    no
    no
    [END]
    [END]
    [END]
    [END]
    shows
    shows
    no
    no
    sign
    sign
    of
    of
    cancer
    cancer
    [END]
    [END]
    [END]
    [END]
    [CLS]
    [CLS]
    [PAD]
    [PAD]
    V1
    V1
    V2
    V2
    V3
    V3
    V4
    V4
    V1
    V1
    V2
    V2
    V3
    V3
    V4
    V4
    V1
    V1
    V2
    V2
    V3
    V3
    V4
    V4
    LayoutLMv3
    LayoutLMv3
    [CLS]
    [CLS]
    shows
    shows
    no
    no
    sign
    sign
    of
    of
    cancer
    cancer
    no
    no
    [END]
    [END]
    V1
    V1
    V2
    V2
    V3
    V3
    V4
    V4
    [CLS]
    [CLS]
    The
    The
    echo
    echo
    gra
    gra
    phy
    phy
    shows
    shows
    no
    no
    [END]
    [END]
    V1
    V1
    V2
    V2
    V3
    V3
    V4
    V4
    The
    The
    echo
    echo
    gra
    gra
    phy
    phy
    shows
    shows
    no
    no
    sign
    sign
    of
    of
    cancer
    cancer
    Reconstruction
    Reconstruction
    Windowing
    Windowing
    Classification
    Classification
    Line n°1
    Line n°1
    Line n°2
    Line n°2
    Body
    Body
    Body
    Body
    Text is not SVG - cannot display
    diff --git a/main/assets/javascripts/bundle.220ee61c.min.js b/main/assets/javascripts/bundle.220ee61c.min.js new file mode 100644 index 00000000..116072a1 --- /dev/null +++ b/main/assets/javascripts/bundle.220ee61c.min.js @@ -0,0 +1,29 @@ +"use strict";(()=>{var Ci=Object.create;var gr=Object.defineProperty;var Ri=Object.getOwnPropertyDescriptor;var ki=Object.getOwnPropertyNames,Ht=Object.getOwnPropertySymbols,Hi=Object.getPrototypeOf,yr=Object.prototype.hasOwnProperty,nn=Object.prototype.propertyIsEnumerable;var rn=(e,t,r)=>t in e?gr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,P=(e,t)=>{for(var r in t||(t={}))yr.call(t,r)&&rn(e,r,t[r]);if(Ht)for(var r of Ht(t))nn.call(t,r)&&rn(e,r,t[r]);return e};var on=(e,t)=>{var r={};for(var n in e)yr.call(e,n)&&t.indexOf(n)<0&&(r[n]=e[n]);if(e!=null&&Ht)for(var n of Ht(e))t.indexOf(n)<0&&nn.call(e,n)&&(r[n]=e[n]);return r};var Pt=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var Pi=(e,t,r,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let o of ki(t))!yr.call(e,o)&&o!==r&&gr(e,o,{get:()=>t[o],enumerable:!(n=Ri(t,o))||n.enumerable});return e};var yt=(e,t,r)=>(r=e!=null?Ci(Hi(e)):{},Pi(t||!e||!e.__esModule?gr(r,"default",{value:e,enumerable:!0}):r,e));var sn=Pt((xr,an)=>{(function(e,t){typeof xr=="object"&&typeof an!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(xr,function(){"use strict";function e(r){var n=!0,o=!1,i=null,s={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function a(O){return!!(O&&O!==document&&O.nodeName!=="HTML"&&O.nodeName!=="BODY"&&"classList"in O&&"contains"in O.classList)}function f(O){var Qe=O.type,De=O.tagName;return!!(De==="INPUT"&&s[Qe]&&!O.readOnly||De==="TEXTAREA"&&!O.readOnly||O.isContentEditable)}function c(O){O.classList.contains("focus-visible")||(O.classList.add("focus-visible"),O.setAttribute("data-focus-visible-added",""))}function 
u(O){O.hasAttribute("data-focus-visible-added")&&(O.classList.remove("focus-visible"),O.removeAttribute("data-focus-visible-added"))}function p(O){O.metaKey||O.altKey||O.ctrlKey||(a(r.activeElement)&&c(r.activeElement),n=!0)}function m(O){n=!1}function d(O){a(O.target)&&(n||f(O.target))&&c(O.target)}function h(O){a(O.target)&&(O.target.classList.contains("focus-visible")||O.target.hasAttribute("data-focus-visible-added"))&&(o=!0,window.clearTimeout(i),i=window.setTimeout(function(){o=!1},100),u(O.target))}function v(O){document.visibilityState==="hidden"&&(o&&(n=!0),Y())}function Y(){document.addEventListener("mousemove",N),document.addEventListener("mousedown",N),document.addEventListener("mouseup",N),document.addEventListener("pointermove",N),document.addEventListener("pointerdown",N),document.addEventListener("pointerup",N),document.addEventListener("touchmove",N),document.addEventListener("touchstart",N),document.addEventListener("touchend",N)}function B(){document.removeEventListener("mousemove",N),document.removeEventListener("mousedown",N),document.removeEventListener("mouseup",N),document.removeEventListener("pointermove",N),document.removeEventListener("pointerdown",N),document.removeEventListener("pointerup",N),document.removeEventListener("touchmove",N),document.removeEventListener("touchstart",N),document.removeEventListener("touchend",N)}function 
N(O){O.target.nodeName&&O.target.nodeName.toLowerCase()==="html"||(n=!1,B())}document.addEventListener("keydown",p,!0),document.addEventListener("mousedown",m,!0),document.addEventListener("pointerdown",m,!0),document.addEventListener("touchstart",m,!0),document.addEventListener("visibilitychange",v,!0),Y(),r.addEventListener("focus",d,!0),r.addEventListener("blur",h,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var cn=Pt(Er=>{(function(e){var t=function(){try{return!!Symbol.iterator}catch(c){return!1}},r=t(),n=function(c){var u={next:function(){var p=c.shift();return{done:p===void 0,value:p}}};return r&&(u[Symbol.iterator]=function(){return u}),u},o=function(c){return encodeURIComponent(c).replace(/%20/g,"+")},i=function(c){return decodeURIComponent(String(c).replace(/\+/g," "))},s=function(){var c=function(p){Object.defineProperty(this,"_entries",{writable:!0,value:{}});var m=typeof p;if(m!=="undefined")if(m==="string")p!==""&&this._fromString(p);else if(p instanceof c){var d=this;p.forEach(function(B,N){d.append(N,B)})}else if(p!==null&&m==="object")if(Object.prototype.toString.call(p)==="[object Array]")for(var h=0;hd[0]?1:0}),c._entries&&(c._entries={});for(var p=0;p1?i(d[1]):"")}})})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er);(function(e){var t=function(){try{var o=new e.URL("b","http://a");return o.pathname="c 
d",o.href==="http://a/c%20d"&&o.searchParams}catch(i){return!1}},r=function(){var o=e.URL,i=function(f,c){typeof f!="string"&&(f=String(f)),c&&typeof c!="string"&&(c=String(c));var u=document,p;if(c&&(e.location===void 0||c!==e.location.href)){c=c.toLowerCase(),u=document.implementation.createHTMLDocument(""),p=u.createElement("base"),p.href=c,u.head.appendChild(p);try{if(p.href.indexOf(c)!==0)throw new Error(p.href)}catch(O){throw new Error("URL unable to set base "+c+" due to "+O)}}var m=u.createElement("a");m.href=f,p&&(u.body.appendChild(m),m.href=m.href);var d=u.createElement("input");if(d.type="url",d.value=f,m.protocol===":"||!/:/.test(m.href)||!d.checkValidity()&&!c)throw new TypeError("Invalid URL");Object.defineProperty(this,"_anchorElement",{value:m});var h=new e.URLSearchParams(this.search),v=!0,Y=!0,B=this;["append","delete","set"].forEach(function(O){var Qe=h[O];h[O]=function(){Qe.apply(h,arguments),v&&(Y=!1,B.search=h.toString(),Y=!0)}}),Object.defineProperty(this,"searchParams",{value:h,enumerable:!0});var N=void 0;Object.defineProperty(this,"_updateSearchParams",{enumerable:!1,configurable:!1,writable:!1,value:function(){this.search!==N&&(N=this.search,Y&&(v=!1,this.searchParams._fromString(this.search),v=!0))}})},s=i.prototype,a=function(f){Object.defineProperty(s,f,{get:function(){return this._anchorElement[f]},set:function(c){this._anchorElement[f]=c},enumerable:!0})};["hash","host","hostname","port","protocol"].forEach(function(f){a(f)}),Object.defineProperty(s,"search",{get:function(){return this._anchorElement.search},set:function(f){this._anchorElement.search=f,this._updateSearchParams()},enumerable:!0}),Object.defineProperties(s,{toString:{get:function(){var f=this;return function(){return f.href}}},href:{get:function(){return this._anchorElement.href.replace(/\?$/,"")},set:function(f){this._anchorElement.href=f,this._updateSearchParams()},enumerable:!0},pathname:{get:function(){return 
this._anchorElement.pathname.replace(/(^\/?)/,"/")},set:function(f){this._anchorElement.pathname=f},enumerable:!0},origin:{get:function(){var f={"http:":80,"https:":443,"ftp:":21}[this._anchorElement.protocol],c=this._anchorElement.port!=f&&this._anchorElement.port!=="";return this._anchorElement.protocol+"//"+this._anchorElement.hostname+(c?":"+this._anchorElement.port:"")},enumerable:!0},password:{get:function(){return""},set:function(f){},enumerable:!0},username:{get:function(){return""},set:function(f){},enumerable:!0}}),i.createObjectURL=function(f){return o.createObjectURL.apply(o,arguments)},i.revokeObjectURL=function(f){return o.revokeObjectURL.apply(o,arguments)},e.URL=i};if(t()||r(),e.location!==void 0&&!("origin"in e.location)){var n=function(){return e.location.protocol+"//"+e.location.hostname+(e.location.port?":"+e.location.port:"")};try{Object.defineProperty(e.location,"origin",{get:n,enumerable:!0})}catch(o){setInterval(function(){e.location.origin=n()},100)}}})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er)});var qr=Pt((Mt,Nr)=>{/*! 
+ * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof Mt=="object"&&typeof Nr=="object"?Nr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof Mt=="object"?Mt.ClipboardJS=r():t.ClipboardJS=r()})(Mt,function(){return function(){var e={686:function(n,o,i){"use strict";i.d(o,{default:function(){return Ai}});var s=i(279),a=i.n(s),f=i(370),c=i.n(f),u=i(817),p=i.n(u);function m(j){try{return document.execCommand(j)}catch(T){return!1}}var d=function(T){var E=p()(T);return m("cut"),E},h=d;function v(j){var T=document.documentElement.getAttribute("dir")==="rtl",E=document.createElement("textarea");E.style.fontSize="12pt",E.style.border="0",E.style.padding="0",E.style.margin="0",E.style.position="absolute",E.style[T?"right":"left"]="-9999px";var H=window.pageYOffset||document.documentElement.scrollTop;return E.style.top="".concat(H,"px"),E.setAttribute("readonly",""),E.value=j,E}var Y=function(T,E){var H=v(T);E.container.appendChild(H);var I=p()(H);return m("copy"),H.remove(),I},B=function(T){var E=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},H="";return typeof T=="string"?H=Y(T,E):T instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(T==null?void 0:T.type)?H=Y(T.value,E):(H=p()(T),m("copy")),H},N=B;function O(j){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?O=function(E){return typeof E}:O=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},O(j)}var Qe=function(){var T=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},E=T.action,H=E===void 0?"copy":E,I=T.container,q=T.target,Me=T.text;if(H!=="copy"&&H!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(q!==void 0)if(q&&O(q)==="object"&&q.nodeType===1){if(H==="copy"&&q.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. 
Please use "readonly" instead of "disabled" attribute');if(H==="cut"&&(q.hasAttribute("readonly")||q.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if(Me)return N(Me,{container:I});if(q)return H==="cut"?h(q):N(q,{container:I})},De=Qe;function $e(j){"@babel/helpers - typeof";return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?$e=function(E){return typeof E}:$e=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},$e(j)}function Ei(j,T){if(!(j instanceof T))throw new TypeError("Cannot call a class as a function")}function tn(j,T){for(var E=0;E0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof I.action=="function"?I.action:this.defaultAction,this.target=typeof I.target=="function"?I.target:this.defaultTarget,this.text=typeof I.text=="function"?I.text:this.defaultText,this.container=$e(I.container)==="object"?I.container:document.body}},{key:"listenClick",value:function(I){var q=this;this.listener=c()(I,"click",function(Me){return q.onClick(Me)})}},{key:"onClick",value:function(I){var q=I.delegateTarget||I.currentTarget,Me=this.action(q)||"copy",kt=De({action:Me,container:this.container,target:this.target(q),text:this.text(q)});this.emit(kt?"success":"error",{action:Me,text:kt,trigger:q,clearSelection:function(){q&&q.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(I){return vr("action",I)}},{key:"defaultTarget",value:function(I){var q=vr("target",I);if(q)return document.querySelector(q)}},{key:"defaultText",value:function(I){return vr("text",I)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(I){var q=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return N(I,q)}},{key:"cut",value:function(I){return 
h(I)}},{key:"isSupported",value:function(){var I=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],q=typeof I=="string"?[I]:I,Me=!!document.queryCommandSupported;return q.forEach(function(kt){Me=Me&&!!document.queryCommandSupported(kt)}),Me}}]),E}(a()),Ai=Li},828:function(n){var o=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function s(a,f){for(;a&&a.nodeType!==o;){if(typeof a.matches=="function"&&a.matches(f))return a;a=a.parentNode}}n.exports=s},438:function(n,o,i){var s=i(828);function a(u,p,m,d,h){var v=c.apply(this,arguments);return u.addEventListener(m,v,h),{destroy:function(){u.removeEventListener(m,v,h)}}}function f(u,p,m,d,h){return typeof u.addEventListener=="function"?a.apply(null,arguments):typeof m=="function"?a.bind(null,document).apply(null,arguments):(typeof u=="string"&&(u=document.querySelectorAll(u)),Array.prototype.map.call(u,function(v){return a(v,p,m,d,h)}))}function c(u,p,m,d){return function(h){h.delegateTarget=s(h.target,p),h.delegateTarget&&d.call(u,h)}}n.exports=f},879:function(n,o){o.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},o.nodeList=function(i){var s=Object.prototype.toString.call(i);return i!==void 0&&(s==="[object NodeList]"||s==="[object HTMLCollection]")&&"length"in i&&(i.length===0||o.node(i[0]))},o.string=function(i){return typeof i=="string"||i instanceof String},o.fn=function(i){var s=Object.prototype.toString.call(i);return s==="[object Function]"}},370:function(n,o,i){var s=i(879),a=i(438);function f(m,d,h){if(!m&&!d&&!h)throw new Error("Missing required arguments");if(!s.string(d))throw new TypeError("Second argument must be a String");if(!s.fn(h))throw new TypeError("Third argument must be a Function");if(s.node(m))return c(m,d,h);if(s.nodeList(m))return u(m,d,h);if(s.string(m))return p(m,d,h);throw new 
TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function c(m,d,h){return m.addEventListener(d,h),{destroy:function(){m.removeEventListener(d,h)}}}function u(m,d,h){return Array.prototype.forEach.call(m,function(v){v.addEventListener(d,h)}),{destroy:function(){Array.prototype.forEach.call(m,function(v){v.removeEventListener(d,h)})}}}function p(m,d,h){return a(document.body,m,d,h)}n.exports=f},817:function(n){function o(i){var s;if(i.nodeName==="SELECT")i.focus(),s=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var a=i.hasAttribute("readonly");a||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),a||i.removeAttribute("readonly"),s=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var f=window.getSelection(),c=document.createRange();c.selectNodeContents(i),f.removeAllRanges(),f.addRange(c),s=f.toString()}return s}n.exports=o},279:function(n){function o(){}o.prototype={on:function(i,s,a){var f=this.e||(this.e={});return(f[i]||(f[i]=[])).push({fn:s,ctx:a}),this},once:function(i,s,a){var f=this;function c(){f.off(i,c),s.apply(a,arguments)}return c._=s,this.on(i,c,a)},emit:function(i){var s=[].slice.call(arguments,1),a=((this.e||(this.e={}))[i]||[]).slice(),f=0,c=a.length;for(f;f{"use strict";/*! 
+ * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var rs=/["'&<>]/;Yo.exports=ns;function ns(e){var t=""+e,r=rs.exec(t);if(!r)return t;var n,o="",i=0,s=0;for(i=r.index;i0&&i[i.length-1])&&(c[0]===6||c[0]===2)){r=0;continue}if(c[0]===3&&(!i||c[1]>i[0]&&c[1]=e.length&&(e=void 0),{value:e&&e[n++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function W(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var n=r.call(e),o,i=[],s;try{for(;(t===void 0||t-- >0)&&!(o=n.next()).done;)i.push(o.value)}catch(a){s={error:a}}finally{try{o&&!o.done&&(r=n.return)&&r.call(n)}finally{if(s)throw s.error}}return i}function D(e,t,r){if(r||arguments.length===2)for(var n=0,o=t.length,i;n1||a(m,d)})})}function a(m,d){try{f(n[m](d))}catch(h){p(i[0][3],h)}}function f(m){m.value instanceof et?Promise.resolve(m.value.v).then(c,u):p(i[0][2],m)}function c(m){a("next",m)}function u(m){a("throw",m)}function p(m,d){m(d),i.shift(),i.length&&a(i[0][0],i[0][1])}}function pn(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof Ee=="function"?Ee(e):e[Symbol.iterator](),r={},n("next"),n("throw"),n("return"),r[Symbol.asyncIterator]=function(){return this},r);function n(i){r[i]=e[i]&&function(s){return new Promise(function(a,f){s=e[i](s),o(a,f,s.done,s.value)})}}function o(i,s,a,f){Promise.resolve(f).then(function(c){i({value:c,done:a})},s)}}function C(e){return typeof e=="function"}function at(e){var t=function(n){Error.call(n),n.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var It=at(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(n,o){return o+1+") "+n.toString()}).join(` + 
`):"",this.name="UnsubscriptionError",this.errors=r}});function Ve(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ie=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,n,o,i;if(!this.closed){this.closed=!0;var s=this._parentage;if(s)if(this._parentage=null,Array.isArray(s))try{for(var a=Ee(s),f=a.next();!f.done;f=a.next()){var c=f.value;c.remove(this)}}catch(v){t={error:v}}finally{try{f&&!f.done&&(r=a.return)&&r.call(a)}finally{if(t)throw t.error}}else s.remove(this);var u=this.initialTeardown;if(C(u))try{u()}catch(v){i=v instanceof It?v.errors:[v]}var p=this._finalizers;if(p){this._finalizers=null;try{for(var m=Ee(p),d=m.next();!d.done;d=m.next()){var h=d.value;try{ln(h)}catch(v){i=i!=null?i:[],v instanceof It?i=D(D([],W(i)),W(v.errors)):i.push(v)}}}catch(v){n={error:v}}finally{try{d&&!d.done&&(o=m.return)&&o.call(m)}finally{if(n)throw n.error}}}if(i)throw new It(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)ln(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Ve(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Ve(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Sr=Ie.EMPTY;function jt(e){return e instanceof Ie||e&&"closed"in e&&C(e.remove)&&C(e.add)&&C(e.unsubscribe)}function ln(e){C(e)?e():e.unsubscribe()}var Le={onUnhandledError:null,onStoppedNotification:null,Promise:void 
0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var st={setTimeout:function(e,t){for(var r=[],n=2;n0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var n=this,o=this,i=o.hasError,s=o.isStopped,a=o.observers;return i||s?Sr:(this.currentObservers=null,a.push(r),new Ie(function(){n.currentObservers=null,Ve(a,r)}))},t.prototype._checkFinalizedStatuses=function(r){var n=this,o=n.hasError,i=n.thrownError,s=n.isStopped;o?r.error(i):s&&r.complete()},t.prototype.asObservable=function(){var r=new F;return r.source=this,r},t.create=function(r,n){return new xn(r,n)},t}(F);var xn=function(e){ie(t,e);function t(r,n){var o=e.call(this)||this;return o.destination=r,o.source=n,o}return t.prototype.next=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.next)===null||o===void 0||o.call(n,r)},t.prototype.error=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.error)===null||o===void 0||o.call(n,r)},t.prototype.complete=function(){var r,n;(n=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||n===void 0||n.call(r)},t.prototype._subscribe=function(r){var n,o;return(o=(n=this.source)===null||n===void 0?void 0:n.subscribe(r))!==null&&o!==void 0?o:Sr},t}(x);var Et={now:function(){return(Et.delegate||Date).now()},delegate:void 0};var wt=function(e){ie(t,e);function t(r,n,o){r===void 0&&(r=1/0),n===void 0&&(n=1/0),o===void 0&&(o=Et);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=n,i._timestampProvider=o,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=n===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,n),i}return t.prototype.next=function(r){var 
n=this,o=n.isStopped,i=n._buffer,s=n._infiniteTimeWindow,a=n._timestampProvider,f=n._windowTime;o||(i.push(r),!s&&i.push(a.now()+f)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var n=this._innerSubscribe(r),o=this,i=o._infiniteTimeWindow,s=o._buffer,a=s.slice(),f=0;f0?e.prototype.requestAsyncId.call(this,r,n,o):(r.actions.push(this),r._scheduled||(r._scheduled=ut.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,n,o){var i;if(o===void 0&&(o=0),o!=null?o>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,n,o);var s=r.actions;n!=null&&((i=s[s.length-1])===null||i===void 0?void 0:i.id)!==n&&(ut.cancelAnimationFrame(n),r._scheduled=void 0)},t}(Wt);var Sn=function(e){ie(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var n=this._scheduled;this._scheduled=void 0;var o=this.actions,i;r=r||o.shift();do if(i=r.execute(r.state,r.delay))break;while((r=o[0])&&r.id===n&&o.shift());if(this._active=!1,i){for(;(r=o[0])&&r.id===n&&o.shift();)r.unsubscribe();throw i}},t}(Dt);var Oe=new Sn(wn);var M=new F(function(e){return e.complete()});function Vt(e){return e&&C(e.schedule)}function Cr(e){return e[e.length-1]}function Ye(e){return C(Cr(e))?e.pop():void 0}function Te(e){return Vt(Cr(e))?e.pop():void 0}function zt(e,t){return typeof Cr(e)=="number"?e.pop():t}var pt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Nt(e){return C(e==null?void 0:e.then)}function qt(e){return C(e[ft])}function Kt(e){return Symbol.asyncIterator&&C(e==null?void 0:e[Symbol.asyncIterator])}function Qt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function zi(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var Yt=zi();function Gt(e){return C(e==null?void 0:e[Yt])}function Bt(e){return un(this,arguments,function(){var r,n,o,i;return $t(this,function(s){switch(s.label){case 0:r=e.getReader(),s.label=1;case 1:s.trys.push([1,,9,10]),s.label=2;case 2:return[4,et(r.read())];case 3:return n=s.sent(),o=n.value,i=n.done,i?[4,et(void 0)]:[3,5];case 4:return[2,s.sent()];case 5:return[4,et(o)];case 6:return[4,s.sent()];case 7:return s.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function Jt(e){return C(e==null?void 0:e.getReader)}function U(e){if(e instanceof F)return e;if(e!=null){if(qt(e))return Ni(e);if(pt(e))return qi(e);if(Nt(e))return Ki(e);if(Kt(e))return On(e);if(Gt(e))return Qi(e);if(Jt(e))return Yi(e)}throw Qt(e)}function Ni(e){return new F(function(t){var r=e[ft]();if(C(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function qi(e){return new F(function(t){for(var r=0;r=2;return function(n){return n.pipe(e?A(function(o,i){return e(o,i,n)}):de,ge(1),r?He(t):Dn(function(){return new Zt}))}}function Vn(){for(var e=[],t=0;t=2,!0))}function pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new x}:t,n=e.resetOnError,o=n===void 0?!0:n,i=e.resetOnComplete,s=i===void 0?!0:i,a=e.resetOnRefCountZero,f=a===void 0?!0:a;return function(c){var u,p,m,d=0,h=!1,v=!1,Y=function(){p==null||p.unsubscribe(),p=void 0},B=function(){Y(),u=m=void 0,h=v=!1},N=function(){var O=u;B(),O==null||O.unsubscribe()};return y(function(O,Qe){d++,!v&&!h&&Y();var De=m=m!=null?m:r();Qe.add(function(){d--,d===0&&!v&&!h&&(p=$r(N,f))}),De.subscribe(Qe),!u&&d>0&&(u=new rt({next:function($e){return 
De.next($e)},error:function($e){v=!0,Y(),p=$r(B,o,$e),De.error($e)},complete:function(){h=!0,Y(),p=$r(B,s),De.complete()}}),U(O).subscribe(u))})(c)}}function $r(e,t){for(var r=[],n=2;ne.next(document)),e}function K(e,t=document){return Array.from(t.querySelectorAll(e))}function z(e,t=document){let r=ce(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function ce(e,t=document){return t.querySelector(e)||void 0}function _e(){return document.activeElement instanceof HTMLElement&&document.activeElement||void 0}function tr(e){return L(b(document.body,"focusin"),b(document.body,"focusout")).pipe(ke(1),l(()=>{let t=_e();return typeof t!="undefined"?e.contains(t):!1}),V(e===_e()),J())}function Xe(e){return{x:e.offsetLeft,y:e.offsetTop}}function Kn(e){return L(b(window,"load"),b(window,"resize")).pipe(Ce(0,Oe),l(()=>Xe(e)),V(Xe(e)))}function rr(e){return{x:e.scrollLeft,y:e.scrollTop}}function dt(e){return L(b(e,"scroll"),b(window,"resize")).pipe(Ce(0,Oe),l(()=>rr(e)),V(rr(e)))}var Yn=function(){if(typeof Map!="undefined")return Map;function e(t,r){var n=-1;return t.some(function(o,i){return o[0]===r?(n=i,!0):!1}),n}return function(){function t(){this.__entries__=[]}return Object.defineProperty(t.prototype,"size",{get:function(){return this.__entries__.length},enumerable:!0,configurable:!0}),t.prototype.get=function(r){var n=e(this.__entries__,r),o=this.__entries__[n];return o&&o[1]},t.prototype.set=function(r,n){var o=e(this.__entries__,r);~o?this.__entries__[o][1]=n:this.__entries__.push([r,n])},t.prototype.delete=function(r){var n=this.__entries__,o=e(n,r);~o&&n.splice(o,1)},t.prototype.has=function(r){return!!~e(this.__entries__,r)},t.prototype.clear=function(){this.__entries__.splice(0)},t.prototype.forEach=function(r,n){n===void 0&&(n=null);for(var 
o=0,i=this.__entries__;o0},e.prototype.connect_=function(){!Wr||this.connected_||(document.addEventListener("transitionend",this.onTransitionEnd_),window.addEventListener("resize",this.refresh),va?(this.mutationsObserver_=new MutationObserver(this.refresh),this.mutationsObserver_.observe(document,{attributes:!0,childList:!0,characterData:!0,subtree:!0})):(document.addEventListener("DOMSubtreeModified",this.refresh),this.mutationEventsAdded_=!0),this.connected_=!0)},e.prototype.disconnect_=function(){!Wr||!this.connected_||(document.removeEventListener("transitionend",this.onTransitionEnd_),window.removeEventListener("resize",this.refresh),this.mutationsObserver_&&this.mutationsObserver_.disconnect(),this.mutationEventsAdded_&&document.removeEventListener("DOMSubtreeModified",this.refresh),this.mutationsObserver_=null,this.mutationEventsAdded_=!1,this.connected_=!1)},e.prototype.onTransitionEnd_=function(t){var r=t.propertyName,n=r===void 0?"":r,o=ba.some(function(i){return!!~n.indexOf(i)});o&&this.refresh()},e.getInstance=function(){return this.instance_||(this.instance_=new e),this.instance_},e.instance_=null,e}(),Gn=function(e,t){for(var r=0,n=Object.keys(t);r0},e}(),Jn=typeof WeakMap!="undefined"?new WeakMap:new Yn,Xn=function(){function e(t){if(!(this instanceof e))throw new TypeError("Cannot call a class as a function.");if(!arguments.length)throw new TypeError("1 argument required, but only 0 present.");var r=ga.getInstance(),n=new La(t,r,this);Jn.set(this,n)}return e}();["observe","unobserve","disconnect"].forEach(function(e){Xn.prototype[e]=function(){var t;return(t=Jn.get(this))[e].apply(t,arguments)}});var Aa=function(){return typeof nr.ResizeObserver!="undefined"?nr.ResizeObserver:Xn}(),Zn=Aa;var eo=new x,Ca=$(()=>k(new Zn(e=>{for(let t of e)eo.next(t)}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),X(1));function he(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ye(e){return 
Ca.pipe(S(t=>t.observe(e)),g(t=>eo.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(()=>he(e)))),V(he(e)))}function bt(e){return{width:e.scrollWidth,height:e.scrollHeight}}function ar(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}var to=new x,Ra=$(()=>k(new IntersectionObserver(e=>{for(let t of e)to.next(t)},{threshold:0}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),X(1));function sr(e){return Ra.pipe(S(t=>t.observe(e)),g(t=>to.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(({isIntersecting:r})=>r))))}function ro(e,t=16){return dt(e).pipe(l(({y:r})=>{let n=he(e),o=bt(e);return r>=o.height-n.height-t}),J())}var cr={drawer:z("[data-md-toggle=drawer]"),search:z("[data-md-toggle=search]")};function no(e){return cr[e].checked}function Ke(e,t){cr[e].checked!==t&&cr[e].click()}function Ue(e){let t=cr[e];return b(t,"change").pipe(l(()=>t.checked),V(t.checked))}function ka(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function Ha(){return L(b(window,"compositionstart").pipe(l(()=>!0)),b(window,"compositionend").pipe(l(()=>!1))).pipe(V(!1))}function oo(){let e=b(window,"keydown").pipe(A(t=>!(t.metaKey||t.ctrlKey)),l(t=>({mode:no("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),A(({mode:t,type:r})=>{if(t==="global"){let n=_e();if(typeof n!="undefined")return!ka(n,r)}return!0}),pe());return Ha().pipe(g(t=>t?M:e))}function le(){return new URL(location.href)}function ot(e){location.href=e.href}function io(){return new x}function ao(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)ao(e,r)}function _(e,t,...r){let n=document.createElement(e);if(t)for(let o of Object.keys(t))typeof 
t[o]!="undefined"&&(typeof t[o]!="boolean"?n.setAttribute(o,t[o]):n.setAttribute(o,""));for(let o of r)ao(n,o);return n}function fr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function so(){return location.hash.substring(1)}function Dr(e){let t=_("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function Pa(e){return L(b(window,"hashchange"),e).pipe(l(so),V(so()),A(t=>t.length>0),X(1))}function co(e){return Pa(e).pipe(l(t=>ce(`[id="${t}"]`)),A(t=>typeof t!="undefined"))}function Vr(e){let t=matchMedia(e);return er(r=>t.addListener(()=>r(t.matches))).pipe(V(t.matches))}function fo(){let e=matchMedia("print");return L(b(window,"beforeprint").pipe(l(()=>!0)),b(window,"afterprint").pipe(l(()=>!1))).pipe(V(e.matches))}function zr(e,t){return e.pipe(g(r=>r?t():M))}function ur(e,t={credentials:"same-origin"}){return ue(fetch(`${e}`,t)).pipe(fe(()=>M),g(r=>r.status!==200?Ot(()=>new Error(r.statusText)):k(r)))}function We(e,t){return ur(e,t).pipe(g(r=>r.json()),X(1))}function uo(e,t){let r=new DOMParser;return ur(e,t).pipe(g(n=>n.text()),l(n=>r.parseFromString(n,"text/xml")),X(1))}function pr(e){let t=_("script",{src:e});return $(()=>(document.head.appendChild(t),L(b(t,"load"),b(t,"error").pipe(g(()=>Ot(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(l(()=>{}),R(()=>document.head.removeChild(t)),ge(1))))}function po(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function lo(){return L(b(window,"scroll",{passive:!0}),b(window,"resize",{passive:!0})).pipe(l(po),V(po()))}function mo(){return{width:innerWidth,height:innerHeight}}function ho(){return b(window,"resize",{passive:!0}).pipe(l(mo),V(mo()))}function bo(){return G([lo(),ho()]).pipe(l(([e,t])=>({offset:e,size:t})),X(1))}function lr(e,{viewport$:t,header$:r}){let n=t.pipe(ee("size")),o=G([n,r]).pipe(l(()=>Xe(e)));return 
G([r,t,o]).pipe(l(([{height:i},{offset:s,size:a},{x:f,y:c}])=>({offset:{x:s.x-f,y:s.y-c+i},size:a})))}(()=>{function e(n,o){parent.postMessage(n,o||"*")}function t(...n){return n.reduce((o,i)=>o.then(()=>new Promise(s=>{let a=document.createElement("script");a.src=i,a.onload=s,document.body.appendChild(a)})),Promise.resolve())}var r=class extends EventTarget{constructor(n){super(),this.url=n,this.m=i=>{i.source===this.w&&(this.dispatchEvent(new MessageEvent("message",{data:i.data})),this.onmessage&&this.onmessage(i))},this.e=(i,s,a,f,c)=>{if(s===`${this.url}`){let u=new ErrorEvent("error",{message:i,filename:s,lineno:a,colno:f,error:c});this.dispatchEvent(u),this.onerror&&this.onerror(u)}};let o=document.createElement("iframe");o.hidden=!0,document.body.appendChild(this.iframe=o),this.w.document.open(),this.w.document.write(` + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + + + +
    + + +
    + +
    + + + + + + +
    +
    + + + +
    +
    +
    + + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    +
    + + + +
    +
    + + + + + + + +

    Changelog

    +

    v0.8.0

    +

    Added

    +
      +
    • Add multi-modal transformers (huggingface-embedding) with windowing options
    • +
    • Add render_page option to pdfminer extractor, for multi-modal PDF features
    • +
    • Add inference utilities (accelerators), with simple mono process support and multi gpu / cpu support
    • +
    • Packaging utils (pipeline.package(...)) to make a pip installable package from a pipeline
    • +
    +

    Changed

    +
      +
    • Updated API to follow EDS-NLP's refactoring
    • +
    • Updated confit to 0.4.2 (better errors) and foldedtensor to 0.3.0 (better multiprocess support)
    • +
    • Removed pipeline.score. You should use pipeline.pipe, a custom scorer and pipeline.select_pipes instead.
    • +
    • Better test coverage
    • +
    • Use hatch instead of setuptools to build the package / docs and run the tests
    • +
    +

    Fixed

    +
      +
    • Fixed attrs dependency only being installed in dev mode
    • +
    +

    v0.7.0

    +

    Major refactoring of the library:

    +

    Core features

    +
      +
    • new pipeline system whose API is inspired by spaCy
    • +
    • first-class support for pytorch
    • +
    • hybrid model inference and training (rules + deep learning)
    • +
    • moved from pandas DataFrame to attrs dataclasses (PDFDoc, Page, Box, ...) for representing PDF documents
    • +
    • new configuration system based on confit (https://github.com/aphp/confit), with support for instantiation of complex deep learning models, off-the-shelf CLI, ...
    • +
    +

    Functional features

    +
      +
    • new extractors: pymupdf and poppler (separate packages for licensing reasons)
    • +
    • many deep learning layers (box-transformer, 2d attention with relative position information, ...)
    • +
    • trainable deep learning classifier
    • +
    • training recipes for deep learning models
    • +
    +

    v0.6.3 - 2023-01-23

    +

    Fixed

    +
      +
    • Allow corrupted PDFs to not raise an error by default (they are treated as empty PDFs)
    • +
    • Fix classification and aggregation for empty PDFs
    • +
    +

    v0.6.2 - 2022-12-07

    +

    Cast bytes-like extractor inputs as bytes

    +

    v0.6.1 - 2022-12-07

    +

    Performance and cuda related fixes.

    +

    v0.6.0 - 2022-12-05

    +

    Many, many changes: +- added torch as the main deep learning framework instead of spaCy and thinc 🎉 +- added poppler and mupdf as alternatives to pdfminer +- new pipeline / config / registry system to facilitate consistency between training and inference +- standardization of the exchange format between components with dataclass models (attrs more specifically) instead of pandas dataframes

    +

    v0.5.3 - 2022-08-31

    +

    Added

    +
      +
    • Add label mapping parameter to aggregators (to merge different types of blocks such as title and body)
    • +
    • Improved line aggregation formula
    • +
    +

    v0.5.2 - 2022-08-30

    +

    Fixed

    +
      +
    • Fix aggregation for empty documents
    • +
    +

    v0.5.1 - 2022-07-26

    +

    Changed

    +
      +
    • Drop the pdf2image dependency, replacing it with pypdfium2 (easier installation)
    • +
    +

    v0.5.0 - 2022-07-25

    +

    Changed

    +
      +
    • Major refactoring of the library. Moved from concepts (aggregation) to plural names (aggregators).
    • +
    +

    v0.4.3 - 2022-07-20

    +

    Fixed

    +
      +
    • Multi page boxes alignment
    • +
    +

    v0.4.2 - 2022-07-06

    +

    Added

    +
      +
    • package-resource.v1 in the misc registry
    • +
    +

    v0.4.1 - 2022-06-14

    +

    Fixed

    +
      +
    • Remove importlib.metadata dependency, which led to issues with Python 3.7
    • +
    +

    v0.4.0 - 2022-06-14

    +

    Added

    +
      +
    • Python 3.7 support, by relaxing dependency constraints
    • +
    • Support for package-resource pipeline for sklearn-pipeline.v1
    • +
    +

    v0.3.2 - 2022-06-03

    +

    Added

    +
      +
    • compare_results in visualisation
    • +
    +

    v0.3.1 - 2022-06-02

    +

    Fixed

    +
      +
    • Rescale transform now keeps origin on top-left corner
    • +
    +

    v0.3.0 - 2022-06-01

    +

    Added

    +
      +
    • Styles management within the extractor
    • +
    • styled.v1 aggregator, to handle styles
    • +
    • rescale.v1 transform, to go back to the original height and width
    • +
    +

    Changed

    +
      +
    • Styles and text extraction are handled by the extractor directly
    • +
    • The PDFMiner line object is not carried around any more
    • +
    +

    Removed

    +
      +
    • Outdated params entry in the EDS-PDF registry.
    • +
    +

    v0.2.2 - 2022-05-12

    +

    Changed

    +
      +
    • Fixed merge_lines bug when lines were empty
    • +
    • Modified the demo consequently
    • +
    +

    v0.2.1 - 2022-05-09

    +

    Changed

    +
      +
    • The extractor always returns a pandas DataFrame, even if it is empty. This enhances robustness and stability.
    • +
    +

    v0.2.0 - 2022-05-09

    +

    Added

    +
      +
    • aggregation submodule to handle the specifics of aggregating text blocks
    • +
    • Base classes for better-defined modules
    • +
    • Uniformise the columns to labels
    • +
    • Add arbitrary contextual information
    • +
    +

    Removed

    +
      +
    • typer legacy dependency
    • +
    • models submodule, which handled the configurations for Spark distribution (deferred to another package)
    • +
    • specific orbis context, which was APHP-specific
    • +
    +

    v0.1.0 - 2022-05-06

    +

    Inception ! 🎉

    +

    Features

    +
      +
    • spaCy-like configuration system
    • +
    • Available classifiers :
    • +
    • dummy.v1, that classifies everything to body
    • +
    • mask.v1, for simple rule-based classification
    • +
    • sklearn.v1, that uses a Scikit-Learn pipeline
    • +
    • random.v1, to better sow chaos
    • +
    • Merge different blocks together for easier visualisation
    • +
    • Streamlit demo with visualisation
    • +
    +

      + + + + + + +
      +
      + + +
      + +
      + + + +
      +
      +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/configuration/index.html b/main/configuration/index.html new file mode 100644 index 00000000..6bf6ada9 --- /dev/null +++ b/main/configuration/index.html @@ -0,0 +1,2421 @@ + + + + + + + + + + + + + + + + + + + + + + Configuration - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      + +
      + + + + + + + + +
      + + +
      + +
      + + + + + + +
      +
      + + + +
      +
      +
      + + + + +
      +
      +
      + + + +
      +
      +
      + + + +
      +
      +
      + + + +
      +
      + + + + + + + +

      Configuration

      +

      EDS-PDF is built on top of the confit configuration system.

      +

      The following catalogue registries are included within EDS-PDF:

      + + + + + + + + + + + + + + + + + +
      SectionDescription
      factoryComponents factories (most often classes)
      adapterRaw data preprocessing functions
      +

      EDS-PDF pipelines are meant to be reproducible and serializable, such that you can always define a pipeline through the configuration system.

      +

      To wit, compare the API-based approach to the configuration-based approach (the two are strictly equivalent):

      +
      +
      +
      +
      import edspdf
      +from pathlib import Path
      +
      +model = edspdf.Pipeline()
      +model.add_pipe("pdfminer-extractor", name="extractor")
      +model.add_pipe("mask-classifier", name="classifier", config=dict(
      +    x0=0.2,
      +    x1=0.9,
      +    y0=0.3,
      +    y1=0.6,
      +    threshold=0.1,
      +))
      +model.add_pipe("simple-aggregator", name="aggregator")
      +
      +# Get a PDF
      +pdf = Path("letter.pdf").read_bytes()
      +
      +pdf = model(pdf)
      +
      +str(pdf.aggregated_texts["body"])
      +# Out: Cher Pr ABC, Cher DEF,\n...
      +
      +
      +
      +
      config.cfg
      [pipeline]
      +pipeline = ["extractor", "classifier", "aggregator"]
      +
      +[components.extractor]
      +@factory = "pdfminer-extractor"
      +
      +[components.classifier]
      +@factory = "mask-classifier"
      +x0 = 0.2
      +x1 = 0.9
      +y0 = 0.3
      +y1 = 0.6
      +threshold = 0.1
      +
      +[components.aggregator]
      +@factory = "simple-aggregator"
      +
      +
      import edspdf
      +from pathlib import Path
      +
      +pipeline = edspdf.load("config.cfg")
      +
      +# Get a PDF
      +pdf = Path("letter.pdf").read_bytes()
      +
      +pdf = pipeline(pdf)
      +
      +str(pdf.aggregated_texts["body"])
      +# Out: Cher Pr ABC, Cher DEF,\n...
      +
      +
      +
      +
      +

      The configuration-based approach strictly separates the definition of the pipeline +from its application and avoids tucking away important configuration details. +Changes to the pipeline are transparent as there is a single source of truth: the configuration file.

      +

        + + + + + + +
        +
        + + +
        + +
        + + + +
        +
        +
        +
        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/contributing/index.html b/main/contributing/index.html new file mode 100644 index 00000000..d0414767 --- /dev/null +++ b/main/contributing/index.html @@ -0,0 +1,2531 @@ + + + + + + + + + + + + + + + + + + + + + + Contributing to EDS-PDF - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
        + +
        + + + + + + + + +
        + + +
        + +
        + + + + + + +
        +
        + + + +
        +
        +
        + + + + +
        +
        +
        + + + +
        +
        +
        + + + +
        +
        +
        + + + +
        +
        + + + + + + + +

        Contributing to EDS-PDF

        +

        We welcome contributions ! There are many ways to help. For example, you can:

        +
          +
        1. Help us track bugs by filing issues
        2. +
        3. Suggest and help prioritise new functionalities
        4. +
        5. Help us make the library as straightforward as possible, by simply asking questions on whatever does not seem clear to you.
        6. +
        +

        Development installation

        +

        To be able to run the test suite and develop your own pipeline, you should clone the repo and install it locally. We use the hatch package manager to manage the project.

        +
        + +
        color:gray # Clone the repository and change directory
        +$ git clone ssh://git@github.com/aphp/edspdf.git
        +---> 100%
        +
        +color:gray # Ensure hatch is installed, preferably via pipx
        +$ pipx install hatch
        +
        +$ cd edspdf
        +
        +color:gray # Enter a shell to develop / test the project. This will install everything required in a virtual environment. You can also `source` the path shown by hatch.
        +$ hatch shell
        +$ ...
        +$ exit  # when you're done
        +
        + +
        + +

        To make sure the pipeline will not fail because of formatting errors, we added pre-commit hooks using the pre-commit Python library. To use it, simply install it:

        +
        + +
        $ pre-commit install
        +
        + +
        + +

        The pre-commit hooks defined in the configuration will automatically run when you commit your changes, letting you know if something went wrong.

        +

        The hooks only run on staged changes. To force-run it on all files, run:

        +
        + +
        $ pre-commit run --all-files
        +---> 100%
        +color:green All good !
        +
        + +
        + +

        Proposing a merge request

        +

        At the very least, your changes should :

        +
          +
        • Be well-documented ;
        • +
        • Pass every test, and preferably implement their own ;
        • +
        • Follow the style guide.
        • +
        +

        Testing your code

        +

        We use the Pytest test suite.

        +

        The following command will run the test suite. Writing your own tests is encouraged !

        +
        pytest
        +
        +

        Should your contribution propose a bug fix, we require the bug be thoroughly tested.

        +

        Style Guide

        +

        We use Black to reformat the code. While other formatters only enforce PEP8 compliance, Black also makes the code uniform. In short :

        +
        +

        Black reformats entire files in place. It is not configurable.

        +
        +

        Moreover, the CI/CD pipeline enforces a number of checks on the "quality" of the code. To wit, non black-formatted code will make the test pipeline fail. We use pre-commit to keep our codebase clean.

        +

        Refer to the development install tutorial for tips on how to format your files automatically. +Most modern editors propose extensions that will format files on save.

        +

        Documentation

        +

        Make sure to document your improvements, both within the code with comprehensive docstrings, +as well as in the documentation itself if need be.

        +

        We use MkDocs for EDS-PDF's documentation. You can view your changes with

        +
        + +
        color:gray # Run the documentation
        +$ hatch run docs:serve
        +
        + +
        + +

        Go to localhost:8000 to see your changes. MkDocs watches for changes in the documentation folder +and automatically reloads the page.

        +

          + + + + + + +
          +
          + + +
          + +
          + + + +
          +
          +
          +
          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/data-structures/index.html b/main/data-structures/index.html new file mode 100644 index 00000000..9ee22a06 --- /dev/null +++ b/main/data-structures/index.html @@ -0,0 +1,3302 @@ + + + + + + + + + + + + + + + + + + + + + + Data Structures - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
          + +
          + + + + + + + + +
          + + +
          + +
          + + + + + + +
          +
          + + + +
          +
          +
          + + + + +
          +
          +
          + + + +
          +
          +
          + + + +
          +
          +
          + + + +
          +
          + + + + + + + +

          Data Structures

          +

          EDS-PDF stores PDFs and their annotations in custom data structures that are +designed to be easy to use and manipulate. We must distinguish between:

          +
            +
          • the data models used to store the PDFs and exchange them between the + different components of EDS-PDF
          • +
          • the tensor structures used to process the PDFs with deep learning models
          • +
          +

          Itinerary of a PDF

          +

          A PDF is first converted to a PDFDoc object, which contains the raw PDF content. This task is usually performed by a PDF extractor component. Once the PDF is converted, the same object will be used and updated by the different components, and returned at the end of the pipeline.

          +

          When running a trainable component, the PDFDoc is preprocessed and converted to tensors containing relevant features for the task. This task is performed in the preprocess method of the component. The resulting tensors are then collated together to form a batch, in the collate method of the component. After running the forward method of the component, the tensor predictions are finally assigned as annotations to original PDFDoc objects in the postprocess method.

          +

          Data models

          +

          The main data structure is the PDFDoc, which represents a full PDF document. It contains the raw PDF content and annotations for the full document, regardless of pages. A PDF is split into Page objects that store their number, dimensions and optionally an image of the rendered page.

          +

          The PDF annotations are stored in Box objects, which represent a rectangular region of the PDF. At the moment, a box can only be specialized into a TextBox to represent text regions, such as lines extracted by a PDF extractor. Aggregated texts are stored in Text objects, which are not associated with a specific box.

          +

          A TextBox contains a list of TextProperties objects to store the style properties of styled spans of the text.

          +
          +Reference +
          + +
          + + + + +
          + + + +
          + + + + + + +
          + + + + +

          + PDFDoc + + +

          + + +
          +

          + Bases: BaseModel

          + + +

          This is the main data structure of the library to hold PDFs. +It contains the content of the PDF, as well as box annotations and text outputs.

          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
          ATTRIBUTEDESCRIPTION
          content +
          +

          The content of the PDF document.

          +
          +

          + + TYPE: + bytes + +

          +
          id +
          +

          The ID of the PDF document.

          +
          +

          + + TYPE: + (str, optional) + +

          +
          pages +
          +

          The pages of the PDF document.

          +
          +

          + + TYPE: + List[Page] + +

          +
          error +
          +

          Whether there was an error when processing this PDF document.

          +
          +

          + + TYPE: + (bool, optional) + +

          +
          content_boxes +
          +

          The content boxes/annotations of the PDF document.

          +
          +

          + + TYPE: + List[Union[TextBox, ImageBox]] + +

          +
          aggregated_texts +
          +

          The aggregated text outputs of the PDF document.

          +
          +

          + + TYPE: + Dict[str, Text] + +

          +
          text_boxes +
          +

          The text boxes of the PDF document.

          +
          +

          + + TYPE: + List[TextBox] + +

          +
          + + + + + +
          + + + + + + + + + + + +
          + +
          + +
          + + + + +

          + Page + + +

          + + +
          +

          + Bases: BaseModel

          + + +

          The Page class represents a page of a PDF document.

          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
          ATTRIBUTEDESCRIPTION
          page_num +
          +

          The page number of the page.

          +
          +

          + + TYPE: + int + +

          +
          width +
          +

          The width of the page.

          +
          +

          + + TYPE: + float + +

          +
          height +
          +

          The height of the page.

          +
          +

          + + TYPE: + float + +

          +
          doc +
          +

          The PDF document that this page belongs to.

          +
          +

          + + TYPE: + PDFDoc + +

          +
          image +
          +

          The rendered image of the page, stored as a NumPy array.

          +
          +

          + + TYPE: + Optional[ndarray] + +

          +
          text_boxes +
          +

          The text boxes of the page.

          +
          +

          + + TYPE: + List[TextBox] + +

          +
          + + + + + +
          + + + + + + + + + + + +
          + +
          + +
          + + + + +

          + TextProperties + + +

          + + +
          +

          + Bases: BaseModel

          + + +

          The TextProperties class represents the style properties of a span of text in a +TextBox.

          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
          ATTRIBUTEDESCRIPTION
          italic +
          +

          Whether the text is italic.

          +
          +

          + + TYPE: + bool + +

          +
          bold +
          +

          Whether the text is bold.

          +
          +

          + + TYPE: + bool + +

          +
          begin +
          +

          The beginning index of the span of text.

          +
          +

          + + TYPE: + int + +

          +
          end +
          +

          The ending index of the span of text.

          +
          +

          + + TYPE: + int + +

          +
          fontname +
          +

          The font name of the span of text.

          +
          +

          + + TYPE: + Optional[str] + +

          +
          + + + + + +
          + + + + + + + + + + + +
          + +
          + +
          + + + + +

          + Box + + +

          + + +
          +

          + Bases: BaseModel

          + + +

          The Box class represents a box annotation in a PDF document. It is the base class +of TextBox.

          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
          ATTRIBUTEDESCRIPTION
          doc +
          +

          The PDF document that this box belongs to.

          +
          +

          + + TYPE: + PDFDoc + +

          +
          page_num +
          +

          The page number of the box.

          +
          +

          + + TYPE: + Optional[int] + +

          +
          x0 +
          +

          The left x-coordinate of the box.

          +
          +

          + + TYPE: + float + +

          +
          x1 +
          +

          The right x-coordinate of the box.

          +
          +

          + + TYPE: + float + +

          +
          y0 +
          +

          The top y-coordinate of the box.

          +
          +

          + + TYPE: + float + +

          +
          y1 +
          +

          The bottom y-coordinate of the box.

          +
          +

          + + TYPE: + float + +

          +
          label +
          +

          The label of the box.

          +
          +

          + + TYPE: + Optional[str] + +

          +
          page +
          +

          The page object that this box belongs to.

          +
          +

          + + TYPE: + Page + +

          +
          + + + + + +
          + + + + + + + + + + + +
          + +
          + +
          + + + + +

          + Text + + +

          + + +
          +

          + Bases: BaseModel

          + + +

          The Text class represents a text object, not bound to any box.

          +

          It can be used to store aggregated text from multiple boxes for example.

          + + + + + + + + + + + + + + + + + + + + +
          ATTRIBUTEDESCRIPTION
          text +
          +

          The text content.

          +
          +

          + + TYPE: + str + +

          +
          properties +
          +

          The style properties of the text.

          +
          +

          + + TYPE: + List[TextProperties] + +

          +
          + + + + + +
          + + + + + + + + + + + +
          + +
          + +
          + + + + +

          + TextBox + + +

          + + +
          +

          + Bases: Box

          + + +

          The TextBox class represents a text box annotation in a PDF document.

          + + + + + + + + + + + + + + + + + + + + +
          ATTRIBUTEDESCRIPTION
          text +
          +

          The text content of the text box.

          +
          +

          + + TYPE: + str + +

          +
          props +
          +

          The style properties of the text box.

          +
          +

          + + TYPE: + List[TextProperties] + +

          +
          + + + + + +
          + + + + + + + + + + + +
          + +
          + +
          + + + + +
          + +
          + +

          +

          +

          +

          +

          +

          +
          +
          +

          Tensor structure

          +

          The tensors used to process PDFs with deep learning models usually contain 4 main dimensions, in addition to the standard embedding dimensions:

          +
            +
          • samples: one entry per PDF in the batch
          • +
          • pages: one entry per page in a PDF
          • +
          • boxes: one entry per box in a page
          • +
          • token: one entry per token in a box (only for text boxes)
          • +
          +

          These tensors use a special FoldedTensor format to store the data in a compact way and reshape the data depending on the requirements of a layer.

          +

            + + + + + + +
            +
            + + +
            + +
            + + + +
            +
            +
            +
            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/index.html b/main/index.html new file mode 100644 index 00000000..384e8860 --- /dev/null +++ b/main/index.html @@ -0,0 +1,2538 @@ + + + + + + + + + + + + + + + + + + + + EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
            + +
            + + + + + + + + +
            + + +
            + +
            + + + + + + +
            +
            + + + +
            +
            +
            + + + + +
            +
            +
            + + + +
            +
            +
            + + + +
            +
            +
            + + + +
            +
            + + + + + + + +

            Overview

            +

            EDS-PDF provides a modular framework to extract text information from PDF documents.

            +

            You can use it out-of-the-box, or extend it to fit your use-case.

            +

            Getting started

            +

            Installation

            +

            Install the library with pip:

            +
            + +
            $ pip install edspdf
            +---> 100%
            +color:green Installation successful
            +
            + +
            + +

            Extracting text

            +

            Let's build a simple PDF extractor that uses a rule-based classifier. There are two +ways to do this, either by using the configuration system or by using +the pipeline API.

            +
            +
            +
            +

            Create a configuration file:

            +
            config.cfg
            [pipeline]
            +pipeline = ["extractor", "classifier", "aggregator"]
            +
            +[components.extractor]
            +@factory = "pdfminer-extractor"
            +
            +[components.classifier]
            +@factory = "mask-classifier"
            +x0 = 0.2
            +x1 = 0.9
            +y0 = 0.3
            +y1 = 0.6
            +threshold = 0.1
            +
            +[components.aggregator]
            +@factory = "simple-aggregator"
            +
            +

            and load it from Python:

            +
            import edspdf
            +from pathlib import Path
            +
            +model = edspdf.load("config.cfg")  # (1)
            +
            +
            +
            +

            Or create a pipeline directly from Python:

            +
            from edspdf import Pipeline
            +
            +model = Pipeline()
            +model.add_pipe("pdfminer-extractor")
            +model.add_pipe(
            +    "mask-classifier",
            +    config=dict(
            +        x0=0.2,
            +        x1=0.9,
            +        y0=0.3,
            +        y1=0.6,
            +        threshold=0.1,
            +    ),
            +)
            +model.add_pipe("simple-aggregator")
            +
            +
            +
            +
            +

            This pipeline can then be applied (for instance with this PDF):

            +
            # Get a PDF
            +pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes()
            +pdf = model(pdf)
            +
            +body = pdf.aggregated_texts["body"]
            +
            +text, style = body.text, body.properties
            +
            +

            See the rule-based recipe for a step-by-step explanation of what is happening.

            +

            Citation

            +

            If you use EDS-PDF, please cite us as below.

            +
            @software{edspdf,
            +  author  = {Dura, Basile and Wajsburt, Perceval and Calliger, Alice and Gérardin, Christel and Bey, Romain},
            +  doi     = {10.5281/zenodo.6902977},
            +  license = {BSD-3-Clause},
            +  title   = {{EDS-PDF: Smart text extraction from PDF documents}},
            +  url     = {https://github.com/aphp/edspdf}
            +}
            +
            +

            Acknowledgement

            +

            We would like to thank Assistance Publique – Hôpitaux de Paris and +AP-HP Foundation for funding this project.

            +

              + + + + + + +
              +
              + + +
              + +
              + + + +
              +
              +
              +
              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/inference/index.html b/main/inference/index.html new file mode 100644 index 00000000..59b76fb0 --- /dev/null +++ b/main/inference/index.html @@ -0,0 +1,2821 @@ + + + + + + + + + + + + + + + + + + + + + + Inference - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
              + +
              + + + + + + + + +
              + + +
              + +
              + + + + + + +
              +
              + + + +
              +
              +
              + + + + +
              +
              +
              + + + +
              +
              +
              + + + +
              +
              +
              + + + +
              +
              + + + + + + + +

              Inference

              +

              Once you have obtained a pipeline, either by composing rule-based components, training a model or loading a model from the disk, you can use it to make predictions on documents. This is referred to as inference.

              +

              Inference on a single document

              +

              In EDS-PDF, computing the prediction on a single document is done by calling the pipeline on the document. The input can be either:

              +
                +
              • a sequence of bytes
              • +
              • or a PDFDoc object
              • +
              +
              from pathlib import Path
              +
              +pipeline = ...
              +content = Path("path/to/.pdf").read_bytes()
              +doc = pipeline(content)
              +
              +

              If you're lucky enough to have a GPU, you can use it to speed up inference by moving the model to the GPU before calling the pipeline. To leverage multiple GPUs, refer to the multiprocessing accelerator description below.

              +
              pipeline.to("cuda")  # same semantics as pytorch
              +doc = pipeline(content)
              +
              +

              Inference on multiple documents

              +

              When processing multiple documents, it is usually more efficient to use the pipeline.pipe(...) method, especially when using deep learning components, since this allows matrix multiplications to be batched together. Depending on your computational resources and requirements, EDS-PDF comes with various "accelerators" to speed up inference (see the Accelerators section for more details). By default, the .pipe() method uses the simple accelerator but you can switch to a different one by passing the accelerator argument.

              +
              pipeline = ...
              +docs = pipeline.pipe(
              +    [content1, content2, ...],
              +    batch_size=16,  # optional, default to the one defined in the pipeline
              +    accelerator=my_accelerator,
              +)
              +
              +

              The pipe method supports the following arguments:

              + + +
              + + + +
              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
              PARAMETERDESCRIPTION
              inputs +

              The inputs to create the PDFDocs from, or the PDFDocs directly.

              +

              + + TYPE: + Any + +

              +
              batch_size +

              The batch size to use. If not provided, the batch size of the pipeline +object will be used.

              +

              + + TYPE: + Optional[int] + + + DEFAULT: + None + +

              +
              accelerator +

              The accelerator to use for processing the documents. If not provided, +the default accelerator will be used.

              +

              + + TYPE: + Optional[Union[str, Accelerator]] + + + DEFAULT: + None + +

              +
              to_doc +

              The function to use to convert the inputs to PDFDoc objects. By default, +the content field of the inputs will be used if dict-like objects are +provided, otherwise the inputs will be passed directly to the pipeline.

              +

              + + TYPE: + Optional[ToDoc] + + + DEFAULT: + None + +

              +
              from_doc +

              The function to use to convert the PDFDoc objects to outputs. By default, +the PDFDoc objects will be returned directly.

              +

              + + TYPE: + FromDoc + + + DEFAULT: + lambda : doc + +

              +
              + +
              + +

              Accelerators

              +

              Simple accelerator

              +
              + + + +

              This is the simplest accelerator which batches the documents and processes each batch +on the main process (the one calling .pipe()).

              +

              Examples

              +
              docs = list(pipeline.pipe([content1, content2, ...]))
              +
              +

              or, if you want to override the model defined batch size

              +
              docs = list(pipeline.pipe([content1, content2, ...], batch_size=8))
              +
              +

              which is equivalent to passing a confit dict

              +
              docs = list(
              +    pipeline.pipe(
              +        [content1, content2, ...],
              +        accelerator={
              +            "@accelerator": "simple",
              +            "batch_size": 8,
              +        },
              +    )
              +)
              +
              +

              or the instantiated accelerator directly

              +
              from edspdf.accelerators.simple import SimpleAccelerator
              +
              +accelerator = SimpleAccelerator(batch_size=8)
              +docs = list(pipeline.pipe([content1, content2, ...], accelerator=accelerator))
              +
              +

              If you have a GPU, make sure to move the model to the appropriate device before +calling .pipe(). If you have multiple GPUs, use the +multiprocessing +accelerator instead.

              +
              pipeline.to("cuda")
              +docs = list(pipeline.pipe([content1, content2, ...]))
              +
              + + + + + + + + + + + + + + +
              PARAMETERDESCRIPTION
              batch_size +

              The number of documents to process in each batch.

              +

              + + TYPE: + int + + + DEFAULT: + 32 + +

              +
              + + + +

              Multiprocessing accelerator

              +
              + + + +

              If you have multiple CPU cores, and optionally multiple GPUs, we provide a +multiprocessing accelerator that allows running inference on multiple +processes.

              +

              This accelerator dispatches the batches between multiple workers +(data-parallelism), and distributes the computation of a given batch on one or two +workers (model-parallelism). This is done by creating two types of workers:

              +
                +
              • a CPUWorker which handles the non deep-learning components and the + preprocessing, collating and postprocessing of deep-learning components
              • +
              • a GPUWorker which handles the forward call of the deep-learning components
              • +
              +

              The advantage of dedicating a worker to the deep-learning components is that it +allows preparing multiple batches in parallel in multiple CPUWorker instances, and ensures +that the GPUWorker never waits for a batch to be ready.

              +

              The overall architecture is described in the following figure, for 3 CPU workers and 2 +GPU workers.

              +
              + +
              + +

              Here is how a small pipeline with rule-based components and deep-learning components +is distributed between the workers:

              +
              + +
              +

              Examples

              +
              docs = list(
              +    pipeline.pipe(
              +        [content1, content2, ...],
              +        accelerator={
              +            "@accelerator": "multiprocessing",
              +            "num_cpu_workers": 3,
              +            "num_gpu_workers": 2,
              +            "batch_size": 8,
              +        },
              +    )
              +)
              +
              + + + + + + + + + + + + + + + + + + + + + + + + + + +
              PARAMETERDESCRIPTION
              batch_size +

              Number of documents to process at a time in a CPU/GPU worker

              +

              + + TYPE: + int + +

              +
              num_cpu_workers +

              Number of CPU workers. A CPU worker handles the non deep-learning components +and the preprocessing, collating and postprocessing of deep-learning components.

              +

              + + TYPE: + Optional[int] + + + DEFAULT: + None + +

              +
              num_gpu_workers +

              Number of GPU workers. A GPU worker handles the forward call of the +deep-learning components.

              +

              + + TYPE: + Optional[int] + + + DEFAULT: + None + +

              +
              gpu_pipe_names +

              List of pipe names to accelerate on a GPUWorker, defaults to all pipes +that inherit from TrainablePipe

              +

              + + TYPE: + Optional[List[str]] + + + DEFAULT: + None + +

              +
              + + + +
              +

                + + + + + + +
                +
                + + +
                + +
                + + + +
                +
                +
                +
                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/layers/box-transformer-layer/index.html b/main/layers/box-transformer-layer/index.html new file mode 100644 index 00000000..8f143cf9 --- /dev/null +++ b/main/layers/box-transformer-layer/index.html @@ -0,0 +1,2691 @@ + + + + + + + + + + + + + + + + + + + + + + BoxTransformerLayer - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                + +
                + + + + + + + + +
                + + +
                + +
                + + + + + + +
                +
                + + + +
                +
                +
                + + + + +
                +
                +
                + + + +
                +
                +
                + + + +
                +
                +
                + + + +
                +
                + + + + + + + +

                BoxTransformerLayer

                +
                + + + + +
                + + +

                BoxTransformerLayer combining a self attention layer and a +linear->activation->linear transformation. This layer is used in the +BoxTransformerModule module.

                + + + +

                Parameters

                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                PARAMETERDESCRIPTION
                input_size +

                Input embedding size

                +

                + + TYPE: + int + +

                +
                num_heads +

                Number of attention heads in the attention layer

                +

                + + TYPE: + int + + + DEFAULT: + 2 + +

                +
                dropout_p +

                Dropout probability both for the attention layer and embedding projections

                +

                + + TYPE: + float + + + DEFAULT: + 0.0 + +

                +
                head_size +

                Head sizes of the attention layer

                +

                + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                +
                activation +

                Activation function used in the linear->activation->linear transformation

                +

                + + TYPE: + ActivationFunction + + + DEFAULT: + 'gelu' + +

                +
                init_resweight +

                Initial weight of the residual gates. +At 0, the layer acts (initially) as an identity function, and at 1 as +a standard Transformer layer. +Initializing with a value close to 0 can help the training converge.

                +

                + + TYPE: + float + + + DEFAULT: + 0.0 + +

                +
                attention_mode +

                Mode of relative position infused attention layer. +See the +relative attention +documentation for more information.

                +

                + + TYPE: + Sequence[Literal['c2c', 'c2p', 'p2c']] + + + DEFAULT: + ('c2c', 'c2p', 'p2c') + +

                +
                position_embedding +

                Position embedding to use as key/query position embedding in the attention +computation.

                +

                + + TYPE: + Optional[Union[FloatTensor, Parameter]] + + + DEFAULT: + None + +

                +
                + + + + +
                + + + + + + + + + +
                + + + +

                +forward + +

                + + +
                + +

                Forward pass of the BoxTransformerLayer

                + + + + + + + + + + + + + + + + + + + + + + + + + + +
                PARAMETERDESCRIPTION
                embeds +

                Embeddings to contextualize +Shape: n_samples * n_keys * input_size

                +

                + + TYPE: + FloatTensor + +

                +
                mask +

                Mask of the embeddings. 0 means padding element. +Shape: n_samples * n_keys

                +

                + + TYPE: + BoolTensor + +

                +
                relative_positions +

                Position of the keys relative to the query elements +Shape: n_samples * n_queries * n_keys * n_coordinates (2 for x/y)

                +

                + + TYPE: + LongTensor + +

                +
                no_position_mask +

                Key / query pairs for which the position attention terms should +be disabled. +Shape: n_samples * n_queries * n_keys

                +

                + + TYPE: + Optional[BoolTensor] + + + DEFAULT: + None + +

                +
                + + + + + + + + + + + + + + + + +
                RETURNSDESCRIPTION
                + + Tuple[FloatTensor, FloatTensor] + + +
                +
                  +
                • Contextualized embeddings + Shape: n_samples * n_queries * n_keys
                • +
                • Attention logits + Shape: n_samples * n_queries * n_keys * n_heads
                • +
                +
                +
                + +
                + +
                + + + +
                + +
                + +
                +

                  + + + + + + +
                  +
                  + + +
                  + +
                  + + + +
                  +
                  +
                  +
                  + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/layers/box-transformer/index.html b/main/layers/box-transformer/index.html new file mode 100644 index 00000000..ad221f1a --- /dev/null +++ b/main/layers/box-transformer/index.html @@ -0,0 +1,2679 @@ + + + + + + + + + + + + + + + + + + + + + + BoxTransformerModule - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                  + +
                  + + + + + + + + +
                  + + +
                  + +
                  + + + + + + +
                  +
                  + + + +
                  +
                  +
                  + + + + +
                  +
                  +
                  + + + +
                  +
                  +
                  + + + +
                  +
                  +
                  + + + +
                  +
                  + + + + + + + +

                  BoxTransformerModule

                  +
                  + + + + +
                  + + + +

                  Box Transformer architecture combining multiple +BoxTransformerLayer +modules. It is mainly used in +BoxTransformer.

                  + +

                  Parameters

                  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                  PARAMETERDESCRIPTION
                  input_size +

                  Input embedding size

                  +

                  + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                  +
                  num_heads +

                  Number of attention heads in the attention layers

                  +

                  + + TYPE: + int + + + DEFAULT: + 2 + +

                  +
                  n_relative_positions +

                  Maximum range of embeddable relative positions between boxes (further +distances are capped to ±n_relative_positions // 2)

                  +

                  + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                  +
                  dropout_p +

                  Dropout probability both for the attention layers and embedding projections

                  +

                  + + TYPE: + float + + + DEFAULT: + 0.0 + +

                  +
                  head_size +

                  Head sizes of the attention layers

                  +

                  + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                  +
                  activation +

                  Activation function used in the linear->activation->linear transformations

                  +

                  + + TYPE: + ActivationFunction + + + DEFAULT: + 'gelu' + +

                  +
                  init_resweight +

                  Initial weight of the residual gates. +At 0, the layer acts (initially) as an identity function, and at 1 as +a standard Transformer layer. +Initializing with a value close to 0 can help the training converge.

                  +

                  + + TYPE: + float + + + DEFAULT: + 0.0 + +

                  +
                  attention_mode +

                  Mode of relative position infused attention layer. +See the +relative attention +documentation for more information.

                  +

                  + + TYPE: + Sequence[Literal['c2c', 'c2p', 'p2c']] + + + DEFAULT: + ('c2c', 'c2p', 'p2c') + +

                  +
                  n_layers +

                  Number of layers in the Transformer

                  +

                  + + TYPE: + int + + + DEFAULT: + 2 + +

                  +
                  + + + + +
                  + + + + + + + + + +
                  + + + +

                  +forward + +

                  + + +
                  + +

                  Forward pass of the BoxTransformer

                  + + + + + + + + + + + + + + + + + + +
                  PARAMETERDESCRIPTION
                  embeds +

                  Embeddings to contextualize +Shape: n_samples * n_keys * input_size

                  +

                  + + TYPE: + FoldedTensor + +

                  +
                  boxes +

                  Layout features of the input elements

                  +

                  + + TYPE: + Dict + +

                  +
                  + + + + + + + + + + + + + + + + +
                  RETURNSDESCRIPTION
                  + + Tuple[FloatTensor, List[FloatTensor]] + + +
                  +
                    +
                  • Output of the last BoxTransformerLayer + Shape: n_samples * n_queries * n_keys
                  • +
                  • Attention logits of all layers + Shape: n_samples * n_queries * n_keys * n_heads
                  • +
                  +
                  +
                  + +
                  + +
                  + + + +
                  + +
                  + +
                  +

                    + + + + + + +
                    +
                    + + +
                    + +
                    + + + +
                    +
                    +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/layers/index.html b/main/layers/index.html new file mode 100644 index 00000000..494a0449 --- /dev/null +++ b/main/layers/index.html @@ -0,0 +1,2361 @@ + + + + + + + + + + + + + + + + + + + + + + Deep learning layers - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    + +
                    + + + + + + + + +
                    + + +
                    + +
                    + + + + + + +
                    +
                    + + + +
                    +
                    +
                    + + + + +
                    +
                    +
                    + + + +
                    +
                    +
                    + + + +
                    +
                    +
                    + + + +
                    +
                    + + + + + + + +

                    Deep learning layers

                    +

                    EDS-PDF provides a set of specialized deep learning layers that can be used to build trainable +components. These layers are built on top of the PyTorch framework and can be used in +any PyTorch model.

                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    LayerDescription
                    BoxTransformerModuleContextualize box embeddings with a 2d Transformer with relative position representations
                    BoxTransformerLayerA single layer of the above BoxTransformerModule layer
                    RelativeAttentionA 2d attention layer that optionally uses relative position to compute its attention scores
                    SinusoidalEmbeddingA position embedding that uses trigonometric functions to encode positions
                    VocabularyA non deep learning layer to encode / decode vocabularies
                    +

                      + + + + + + +
                      +
                      + + +
                      + +
                      + + + +
                      +
                      +
                      +
                      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/layers/relative-attention/index.html b/main/layers/relative-attention/index.html new file mode 100644 index 00000000..10fde378 --- /dev/null +++ b/main/layers/relative-attention/index.html @@ -0,0 +1,2891 @@ + + + + + + + + + + + + + + + + + + + + + + RelativeAttention - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      + +
                      + + + + + + + + +
                      + + +
                      + +
                      + + + + + + +
                      +
                      + + + +
                      +
                      +
                      + + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      +
                      + + + +
                      +
                      + + + + + + + +

                      RelativeAttention

                      +
                      + + + + +
                      + + +

                      A self/cross-attention layer that takes relative position of elements into +account to compute the attention weights. +When running a relative attention layer, key and queries are represented using +content and position embeddings, where position embeddings are retrieved using +the relative position of keys relative to queries

                      + + + +

                      Parameters

                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      PARAMETERDESCRIPTION
                      size +

                      The size of the output embeddings +Also serves as default if query_size, pos_size, or key_size is None

                      +

                      + + TYPE: + int + +

                      +
                      n_heads +

                      The number of attention heads

                      +

                      + + TYPE: + int + +

                      +
                      query_size +

                      The size of the query embeddings.

                      +

                      + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                      +
                      key_size +

                      The size of the key embeddings.

                      +

                      + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                      +
                      value_size +

                      The size of the value embeddings

                      +

                      + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                      +
                      head_size +

                      The size of each query / key / value chunk used in the attention dot product +Default: key_size / n_heads

                      +

                      + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                      +
                      position_embedding +

                      The position embedding used as key and query embeddings

                      +

                      + + TYPE: + Optional[Union[FloatTensor, Parameter]] + + + DEFAULT: + None + +

                      +
                      dropout_p +

                      Dropout probability applied on the attention weights +Default: 0.0

                      +

                      + + TYPE: + float + + + DEFAULT: + 0.0 + +

                      +
                      same_key_query_proj +

                      Whether to use the same projection operator for content key and queries +when computing the pre-attention key and query embedding chunks +Default: False

                      +

                      + + TYPE: + bool + + + DEFAULT: + False + +

                      +
                      same_positional_key_query_proj +

                      Whether to use the same projection operator for positional key and queries +when computing the pre-attention key and query embedding chunks +Default: False

                      +

                      + + TYPE: + bool + + + DEFAULT: + False + +

                      +
                      n_coordinates +

                      The number of positional coordinates +For instance, text is 1D so 1 coordinate, images are 2D so 2 coordinates ... +Default: 1

                      +

                      + + TYPE: + int + + + DEFAULT: + 1 + +

                      +
                      head_bias +

                      Whether to learn a bias term to add to the attention logits +This is only useful if you plan to use the attention logits for subsequent +operations, since attention weights are unaffected by bias terms.

                      +

                      + + TYPE: + bool + + + DEFAULT: + True + +

                      +
                      do_pooling +

                      Whether to compute the output embedding. +If you only plan to use attention logits, you should disable this parameter. +Default: True

                      +

                      + + TYPE: + bool + + + DEFAULT: + True + +

                      +
                      mode +

                      Whether to compute content to content (c2c), content to position (c2p) +or position to content (p2c) attention terms. +Setting mode=('c2c',) disables relative position attention terms: this is +the standard attention layer. +To get a better intuition about these different types of attention, here is +a formulation as fictitious search samples from a word in a (1D) text:

                      +
                        +
                      • content-content : "my content is ’ultrasound’ so I’m looking for other + words whose content contains information about temporality"
                      • +
                      • content-position: "my content is ’ultrasound’ so I’m looking for other + words that are 3 positions after me"
                      • +
                      • position-content : "regardless of my content, I will attend to the word + one position after me if it contains information about temporality, + two words after me if it contains information about location, etc."
                      • +
                      +

                      + + TYPE: + Sequence[Literal['c2c', 'c2p', 'p2c']] + + + DEFAULT: + ('c2c', 'p2c', 'c2p') + +

                      +
                      n_additional_heads +

                      The number of additional head logits to compute. +Those are not used to compute output embeddings, but may be useful in +subsequent operations. +Default: 0

                      +

                      + + TYPE: + int + + + DEFAULT: + 0 + +

                      +
                      + + + + +
                      + + + + + + + + + +
                      + + + +

                      +forward + +

                      + + +
                      + +

                      Forward pass of the RelativeAttention layer.

                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      PARAMETERDESCRIPTION
                      content_queries +

                      The content query embedding to use in the attention computation +Shape: n_samples * n_queries * query_size

                      +

                      + + TYPE: + FloatTensor + +

                      +
                      content_keys +

                      The content key embedding to use in the attention computation. +If None, defaults to the content_queries +Shape: n_samples * n_keys * query_size

                      +

                      + + TYPE: + Optional[FloatTensor] + + + DEFAULT: + None + +

                      +
                      content_values +

                      The content values embedding to use in the final pooling computation. +If None, pooling won't be performed. +Shape: n_samples * n_keys * query_size

                      +

                      + + TYPE: + Optional[FloatTensor] + + + DEFAULT: + None + +

                      +
                      mask +

                      The boolean mask to use in the attention computation. +If None, no mask is applied +Shape: either +- n_samples * n_keys +- n_samples * n_queries * n_keys +- n_samples * n_queries * n_keys * n_heads

                      +

                      + + TYPE: + Optional[BoolTensor] + + + DEFAULT: + None + +

                      +
                      relative_positions +

                      The relative position of keys relative to queries +If None, positional attention terms won't be computed. +Shape: n_samples * n_queries * n_keys * n_coordinates

                      +

                      + + TYPE: + Optional[LongTensor] + + + DEFAULT: + None + +

                      +
                      no_position_mask +

                      Key / query pairs for which the position attention terms should +be disabled. +Shape: n_samples * n_queries * n_keys

                      +

                      + + TYPE: + Optional[BoolTensor] + + + DEFAULT: + None + +

                      +
                      base_attn +

                      Attention logits to add to the computed attention logits +Shape: n_samples * n_queries * n_keys * n_heads

                      +

                      + + TYPE: + Optional[FloatTensor] + + + DEFAULT: + None + +

                      +
                      + + + + + + + + + + + + + + + + +
                      RETURNSDESCRIPTION
                      + + Union[Tuple[FloatTensor, FloatTensor], FloatTensor] + + +
                      +
                        +
                      • the output contextualized embeddings (only if content_values is not None + and the do_pooling attribute is set to True) + Shape: n_samples * n_keys * size
                      • +
                      • the attention logits + Shape: n_samples * n_keys * n_queries * (n_heads + n_additional_heads)
                      • +
                      +
                      +
                      + +
                      + +
                      + + + +
                      + +
                      + +
                      +

                        + + + + + + +
                        +
                        + + +
                        + +
                        + + + +
                        +
                        +
                        +
                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/layers/sinusoidal-embedding/index.html b/main/layers/sinusoidal-embedding/index.html new file mode 100644 index 00000000..499fe95d --- /dev/null +++ b/main/layers/sinusoidal-embedding/index.html @@ -0,0 +1,2557 @@ + + + + + + + + + + + + + + + + + + + + + + SinusoidalEmbedding - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                        + +
                        + + + + + + + + +
                        + + +
                        + +
                        + + + + + + +
                        +
                        + + + +
                        +
                        +
                        + + + + +
                        +
                        +
                        + + + +
                        +
                        +
                        + + + +
                        +
                        +
                        + + + +
                        +
                        + + + + + + + +

                        SinusoidalEmbedding

                        +
                        + + + + +
                        + + +

                        A position embedding lookup table that stores embeddings for a fixed number +of positions. +The value of each of the embedding_dim channels of the generated embedding +is generated according to a trigonometric function (sin for even channels, +cos for odd channels). +The frequency of the signal in each pair of channels varies according to the +temperature parameter.

                        +

                        Any input position above the maximum value num_embeddings will be capped to +num_embeddings - 1

                        + + + +

                        Parameters

                        + + + + + + + + + + + + + + + + + + + + + +
                        PARAMETERDESCRIPTION
                        num_embeddings +

                        The maximum number of position embeddings stored in this table

                        +

                        + + TYPE: + int + +

                        +
                        embedding_dim +

                        The embedding size

                        +

                        + + TYPE: + int + +

                        +
                        temperature +

                        The temperature controls the range of frequencies used by each +channel of the embedding

                        +

                        + + TYPE: + float + + + DEFAULT: + 10000.0 + +

                        +
                        + + + + +
                        + + + + + + + + + +
                        + + + +

                        +forward + +

                        + + +
                        + +

                        Forward pass of the SinusoidalEmbedding module

                        + + + + + + + + + + + + + + +
                        PARAMETERDESCRIPTION
                        indices +

                        Shape: any

                        +

                        + + TYPE: + LongTensor + +

                        +
                        + + + + + + + + + + + + + + + + +
                        RETURNSDESCRIPTION
                        + + FloatTensor + + +
                        +

                        Shape: (*input_shape, embedding_dim)

                        +
                        +
                        + +
                        + +
                        + + + +
                        + +
                        + +
                        +

                          + + + + + + +
                          +
                          + + +
                          + +
                          + + + +
                          +
                          +
                          +
                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/layers/vocabulary/index.html b/main/layers/vocabulary/index.html new file mode 100644 index 00000000..4300b804 --- /dev/null +++ b/main/layers/vocabulary/index.html @@ -0,0 +1,2679 @@ + + + + + + + + + + + + + + + + + + + + + + Vocabulary - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                          + +
                          + + + + + + + + +
                          + + +
                          + +
                          + + + + + + +
                          +
                          + + + +
                          +
                          +
                          + + + + +
                          +
                          +
                          + + + +
                          +
                          +
                          + + + +
                          +
                          +
                          + + + +
                          +
                          + + + + + + + +

                          Vocabulary

                          +
                          + + + + +
                          + + +

                          Vocabulary layer. +This is not meant to be used as a torch.nn.Module but subclassing +torch.nn.Module makes the instances appear when printing a model, which is nice.

                          + + + +

                          Parameters

                          + + + + + + + + + + + + + + + + + +
                          PARAMETERDESCRIPTION
                          items +

                          Initial vocabulary elements if any. +Specific elements such as padding and unk can be set here to enforce their +index in the vocabulary.

                          +

                          + + TYPE: + Sequence[T] + + + DEFAULT: + None + +

                          +
                          default +

                          Default index to use for out of vocabulary elements +Defaults to -100

                          +

                          + + TYPE: + int + + + DEFAULT: + -100 + +

                          +
                          + + + + +
                          + + + + + + + +

                          Functions

                          + +
                          + + + +

                          +initialization + +

                          + + +
                          + +

                          Enters the initialization mode. +Out of vocabulary elements will be assigned an index.

                          + +
                          + +
                          + +
                          + + + +

                          +encode + +

                          + + +
                          + +

                          Converts an element into its vocabulary index +If the layer is in its initialization mode (with vocab.initialization(): ...), +and the element is out of vocabulary, a new index will be created and returned. +Otherwise, any oov element will be encoded with the default index.

                          + + + + + + + + + + + + + + +
                          PARAMETERDESCRIPTION
                          item + +

                          +

                          +
                          + + + + + + + + + + + + + + + + +
                          RETURNSDESCRIPTION
                          + + int + + +
                          + +
                          +
                          + +
                          + +
                          + +
                          + + + +

                          +decode + +

                          + + +
                          + +

                          Converts an index into its original value

                          + + + + + + + + + + + + + + +
                          PARAMETERDESCRIPTION
                          idx + +

                          +

                          +
                          + + + + + + + + + + + + + + + + +
                          RETURNSDESCRIPTION
                          + + InputT + + +
                          + +
                          +
                          + +
                          + +
                          + + + +
                          + +
                          + +
                          +

                            + + + + + + +
                            +
                            + + +
                            + +
                            + + + +
                            +
                            +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/objects.inv b/main/objects.inv new file mode 100644 index 00000000..4e643960 Binary files /dev/null and b/main/objects.inv differ diff --git a/main/pipeline/index.html b/main/pipeline/index.html new file mode 100644 index 00000000..d01afbd1 --- /dev/null +++ b/main/pipeline/index.html @@ -0,0 +1,2492 @@ + + + + + + + + + + + + + + + + + + + + + + Pipeline - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            + +
                            + + + + + + + + +
                            + + +
                            + +
                            + + + + + + +
                            +
                            + + + +
                            +
                            +
                            + + + + +
                            +
                            +
                            + + + +
                            +
                            +
                            + + + +
                            +
                            +
                            + + + +
                            +
                            + + + + + + + +

                            Pipeline

                            +

                            The goal of EDS-PDF is to provide a framework for processing PDF documents, along with some utilities and a few components, stitched together by a robust pipeline and configuration system.

                            +

                            Processing PDFs usually involves many steps such as extracting lines, running OCR models, detecting and classifying boxes, filtering and aggregating parts of the extracted texts, etc. Organising these steps together, combining static and deep learning components, while remaining modular and efficient is a challenge. This is why EDS-PDF is built on top of a new pipelining system.

                            +
                            +

                            Deep learning frameworks

                            +

                            The EDS-PDF trainable components are built around the PyTorch framework. While you +can use any technology in static components, we do not provide tools to train +components built with other deep learning frameworks.

                            +
                            +

                            Creating a pipeline

                            +

                            A pipe is a processing block (like a function) that applies a transformation on its input and returns a modified object.

                            +

                            At the moment, four types of pipes are implemented in the library:

                            +
                              +
                            1. extraction components extract lines from a raw PDF and return a PDFDoc object filled with these text boxes.
                            2. +
                            3. classification components classify each box with labels, such as body, header, footer...
                            4. +
                            5. aggregation components compile the lines together according to their classes to re-create the original text.
                            6. +
                            7. embedding components don't directly update the annotations on the document but have specific deep-learning methods (see the TrainablePipe page) that can be composed to form a machine learning model.
                            8. +
                            +

                            To create your first pipeline, execute the following code:

                            +
                            from edspdf import Pipeline
                            +
                            +model = Pipeline()
                            +# will extract text lines from a document
                            +model.add_pipe(
                            +    "pdfminer-extractor",
                            +    config=dict(
                            +        extract_style=False,
                            +    ),
                            +)
                            +# classify everything inside the `body` bounding box as `body`
                            +model.add_pipe(
                            +    "mask-classifier", config=dict(body={"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.9})
                            +)
                            +# aggregates the lines together to re-create the original text
                            +model.add_pipe("simple-aggregator")
                            +
                            +

                            This pipeline can then be run on one or more PDF documents. +As the pipeline processes documents, components will be called in the order +they were added to the pipeline.

                            +
                            from pathlib import Path
                            +
                            +pdf_bytes = Path("path/to/your/pdf").read_bytes()
                            +
                            +# Processing one document
                            +model(pdf_bytes)
                            +
                            +# Processing multiple documents
                            +model.pipe([pdf_bytes, ...])
                            +
                            +

                            For more information on how to use the pipeline, refer to the Inference page.

                            +

                            Hybrid models

                            +

                            EDS-PDF was designed to facilitate the training and inference of hybrid models that +arbitrarily chain static components or trained deep learning components. Static components are callable objects that take a PDFDoc object as input, perform arbitrary transformations over the input, and return the modified object. Trainable pipes, on the other hand, allow for deep learning operations to be performed on the PDFDoc object and must be trained to be used.

                            +

                            Saving and loading a pipeline

                            +

                            Pipelines can be saved and loaded using the save and load methods. The saved pipeline is not a pickled object but a folder containing the config file, the weights and extra resources for each pipeline. This allows for easy inspection and modification of the pipeline, and avoids the execution of arbitrary code when loading a pipeline.

                            +
                            model.save("path/to/your/model")
                            +model = edspdf.load("path/to/your/model")
                            +
                            +

                            To share the pipeline and turn it into a pip installable package, you can use the package method, which will use or create a pyproject.toml file, fill it accordingly, and create a wheel file. At the moment, we only support the poetry package manager.

                            +
                            model.package(
                            +    name="your-package-name",  # leave None to reuse name in pyproject.toml
                            +    version="0.0.1",
                            +    root_dir="path/to/project/root",  # optional, to retrieve an existing pyproject.toml file
                            +    # if you don't have a pyproject.toml, you can provide the metadata here instead
                            +    metadata=dict(
                            +        authors="Firstname Lastname <your.email@domain.fr>",
                            +        description="A short description of your package",
                            +    ),
                            +)
                            +
                            +

                            This will create a wheel file in the root_dir/dist folder, which you can share and install with pip

                            +

                              + + + + + + +
                              +
                              + + +
                              + +
                              + + + +
                              +
                              +
                              +
                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/aggregators/index.html b/main/pipes/aggregators/index.html new file mode 100644 index 00000000..593167ed --- /dev/null +++ b/main/pipes/aggregators/index.html @@ -0,0 +1,2348 @@ + + + + + + + + + + + + + + + + + + + + + + Aggregation - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                              + +
                              + + + + + + + + +
                              + + +
                              + +
                              + + + + + + +
                              +
                              + + + +
                              +
                              +
                              + + + + +
                              +
                              +
                              + + + +
                              +
                              +
                              + + + +
                              +
                              +
                              + + + +
                              +
                              + + + + + + + +

                              Aggregation

                              +

                              The aggregation step compiles extracted text blocks together according to their detected class.

                              + + + + + + + + + + + + + + + +
                              Factory nameDescription
                              simple-aggregatorReturns a dictionary with one key for each detected class
                              + +

                                + + + + + + +
                                +
                                + + +
                                + +
                                + + + +
                                +
                                +
                                +
                                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/aggregators/simple-aggregator/index.html b/main/pipes/aggregators/simple-aggregator/index.html new file mode 100644 index 00000000..008f48a6 --- /dev/null +++ b/main/pipes/aggregators/simple-aggregator/index.html @@ -0,0 +1,2648 @@ + + + + + + + + + + + + + + + + + + + + + + Simple aggregator - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                + +
                                + + + + + + + + +
                                + + +
                                + +
                                + + + + + + +
                                +
                                + + + +
                                +
                                +
                                + + + + +
                                +
                                +
                                + + + +
                                +
                                +
                                + + + +
                                +
                                +
                                + + + +
                                +
                                + + + + + + + +
                                + + + + +
                                + + + +
                                + + + + + + +
                                + + + + +

                                +SimpleAggregator + +

                                + + +
                                + + +

                                Aggregator that returns texts and styles. It groups all text boxes with the same +label under the aggregated_texts attribute, and additionally aggregates the +styles of the text boxes.

                                +

                                Examples

                                +

                                Create a pipeline

                                +
                                +
                                +
                                +
                                pipeline = ...
                                +pipeline.add_pipe(
                                +    "simple-aggregator",
                                +    name="aggregator",
                                +    config={
                                +        "new_line_threshold": 0.2,
                                +        "new_paragraph_threshold": 1.5,
                                +        "label_map": {
                                +            "body": "text",
                                +            "table": "text",
                                +        },
                                +    },
                                +)
                                +
                                +
                                +
                                +
                                ...
                                +
                                +[components.aggregator]
                                +@factory = "simple-aggregator"
                                +new_line_threshold = 0.2
                                +new_paragraph_threshold = 1.5
                                +label_map = { body = "text", table = "text" }
                                +
                                +...
                                +
                                +
                                +
                                +
                                +

                                and run it on a document:

                                +
                                doc = pipeline(doc)
                                +print(doc.aggregated_texts)
                                +# {
                                +#     "text": "This is the body of the document, followed by a table | A | B |"
                                +# }
                                +
                                + +

                                Parameters

                                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                PARAMETERDESCRIPTION
                                pipeline +

                                The pipeline object

                                +

                                + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                +
                                name +

                                The name of the component

                                +

                                + + TYPE: + str + + + DEFAULT: + 'simple-aggregator' + +

                                +
                                sort +

                                Whether to sort text boxes inside each label group by (page, y, x) position +before merging them.

                                +

                                + + TYPE: + bool + + + DEFAULT: + False + +

                                +
                                new_line_threshold +

                                Minimum ratio of the distance between two lines to the median height of +lines to consider them as being on separate lines

                                +

                                + + TYPE: + float + + + DEFAULT: + 0.2 + +

                                +
                                new_paragraph_threshold +

                                Minimum ratio of the distance between two lines to the median height of +lines to consider them as being on separate paragraphs and thus add a +newline character between them.

                                +

                                + + TYPE: + float + + + DEFAULT: + 1.5 + +

                                +
                                label_map +

                                A dictionary mapping labels to new labels. This is useful to group labels +together, for instance, to output both "body" and "table" as "text".

                                +

                                + + TYPE: + Dict + + + DEFAULT: + {} + +

                                +
                                + + +
                                + Source code in edspdf/pipes/aggregators/simple.py +
                                84
                                +85
                                +86
                                +87
                                +88
                                +89
                                +90
                                +91
                                +92
                                +93
                                +94
                                +95
                                +96
                                +97
                                def __init__(
                                +    self,
                                +    pipeline: Pipeline = None,
                                +    name: str = "simple-aggregator",
                                +    sort: bool = False,
                                +    new_line_threshold: float = 0.2,
                                +    new_paragraph_threshold: float = 1.5,
                                +    label_map: Dict = {},
                                +) -> None:
                                +    self.name = name
                                +    self.sort = sort
                                +    self.label_map = dict(label_map)
                                +    self.new_line_threshold = new_line_threshold
                                +    self.new_paragraph_threshold = new_paragraph_threshold
                                +
                                +
                                + + + +
                                + + + + + + + + + + + +
                                + +
                                + +
                                + + + + +
                                + +
                                + +
                                +

                                  + + + + + + +
                                  +
                                  + + +
                                  + +
                                  + + + +
                                  +
                                  +
                                  +
                                  + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/box-classifiers/dummy/index.html b/main/pipes/box-classifiers/dummy/index.html new file mode 100644 index 00000000..71c7b265 --- /dev/null +++ b/main/pipes/box-classifiers/dummy/index.html @@ -0,0 +1,2473 @@ + + + + + + + + + + + + + + + + + + + + + + Dummy classifier - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                  + +
                                  + + + + + + + + +
                                  + + +
                                  + +
                                  + + + + + + +
                                  +
                                  + + + +
                                  +
                                  +
                                  + + + + +
                                  +
                                  +
                                  + + + +
                                  +
                                  +
                                  + + + +
                                  +
                                  +
                                  + + + +
                                  +
                                  + + + + + + + +

                                  Dummy classifier

                                  +
                                  + + + + +
                                  + + +

                                  Dummy classifier, for testing purposes. Assigns the same configured label to every line.

                                  + +

                                  Parameters

                                  + + + + + + + + + + + + + + + + + + + + + +
                                  PARAMETERDESCRIPTION
                                  pipeline +

                                  The pipeline object.

                                  +

                                  + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                  +
                                  name +

                                  The name of the component.

                                  +

                                  + + TYPE: + str + + + DEFAULT: + 'dummy-classifier' + +

                                  +
                                  label +

                                  The label to assign to each line.

                                  +

                                  + + TYPE: + str + +

                                  +
                                  + + + + + +
                                  + + + + + + + + + + + +
                                  + +
                                  + +
                                  +

                                    + + + + + + +
                                    +
                                    + + +
                                    + +
                                    + + + +
                                    +
                                    +
                                    +
                                    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/box-classifiers/index.html b/main/pipes/box-classifiers/index.html new file mode 100644 index 00000000..ebecd6d9 --- /dev/null +++ b/main/pipes/box-classifiers/index.html @@ -0,0 +1,2364 @@ + + + + + + + + + + + + + + + + + + + + + + Box classifiers - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                    + +
                                    + + + + + + + + +
                                    + + +
                                    + +
                                    + + + + + + +
                                    +
                                    + + + +
                                    +
                                    +
                                    + + + + +
                                    +
                                    +
                                    + + + +
                                    +
                                    +
                                    + + + +
                                    +
                                    +
                                    + + + +
                                    +
                                    + + + + + + + +

                                    Box classifiers

                                    +

                                    We developed EDS-PDF with modularity in mind. To that end, you can choose between multiple classification methods.

                                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                    Factory nameDescription
                                    mask-classifierSimple rule-based classification
                                    multi-mask-classifierSimple rule-based classification
                                    dummy-classifierDummy classifier, for testing purposes.
                                    random-classifierTo sow chaos
                                    trainable-classifierTrainable box classification model
                                    + +

                                      + + + + + + +
                                      +
                                      + + +
                                      + +
                                      + + + +
                                      +
                                      +
                                      +
                                      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/box-classifiers/mask/index.html b/main/pipes/box-classifiers/mask/index.html new file mode 100644 index 00000000..d63d967f --- /dev/null +++ b/main/pipes/box-classifiers/mask/index.html @@ -0,0 +1,2794 @@ + + + + + + + + + + + + + + + + + + + + + + Mask Classification - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                      + +
                                      + + + + + + + + +
                                      + + +
                                      + +
                                      + + + + + + +
                                      +
                                      + + + +
                                      +
                                      +
                                      + + + + +
                                      +
                                      +
                                      + + + +
                                      +
                                      +
                                      + + + +
                                      +
                                      +
                                      + + + +
                                      +
                                      + + + + + + + +

                                      Mask Classification

                                      +

                                      We developed a simple classifier that roughly uses the same strategy as PDFBox, namely:

                                      +
                                        +
                                      • define a "mask" on the PDF documents;
                                      • +
                                      • keep every text block within that mask, tag everything else as pollution.
                                      • +
                                      +

                                      Factories

                                      +

                                      Two factories are available in the classifiers registry: mask-classifier and multi-mask-classifier.

                                      +

                                      mask-classifier

                                      + + +
                                      + + + +
                                      + +

                                      The simplest form of mask classification. You define the mask, everything else +is tagged as pollution.

                                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                      PARAMETERDESCRIPTION
                                      pipeline +

                                      The pipeline object

                                      +

                                      + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                      +
                                      name +

                                      The name of the component

                                      +

                                      + + TYPE: + str + + + DEFAULT: + 'mask-classifier' + +

                                      +
                                      x0 +

                                      The x0 coordinate of the mask

                                      +

                                      + + TYPE: + float + +

                                      +
                                      y0 +

                                      The y0 coordinate of the mask

                                      +

                                      + + TYPE: + float + +

                                      +
                                      x1 +

                                      The x1 coordinate of the mask

                                      +

                                      + + TYPE: + float + +

                                      +
                                      y1 +

                                      The y1 coordinate of the mask

                                      +

                                      + + TYPE: + float + +

                                      +
                                      threshold +

                                      The threshold for the alignment

                                      +

                                      + + TYPE: + float + + + DEFAULT: + 1.0 + +

                                      +
                                      +

                                      Examples

                                      +
                                      +
                                      +
                                      +
                                      pipeline.add_pipe(
                                      +    "mask-classifier",
                                      +    name="classifier",
                                      +    config={
                                      +        "threshold": 0.9,
                                      +        "x0": 0.1,
                                      +        "y0": 0.1,
                                      +        "x1": 0.9,
                                      +        "y1": 0.9,
                                      +    },
                                      +)
                                      +
                                      +
                                      +
                                      +
                                      [components.classifier]
                                      +@factory = "mask-classifier"
                                      +x0 = 0.1
                                      +y0 = 0.1
                                      +x1 = 0.9
                                      +y1 = 0.9
                                      +threshold = 0.9
                                      +
                                      +
                                      +
                                      +
                                      + +
                                      + +

                                      +

                                      multi-mask-classifier

                                      + + +
                                      + + + +
                                      + +

                                      A generalisation, wherein the user defines a number of regions.

                                      +

                                      The following configuration produces exactly the same classifier as the mask-classifier +example above.

                                      +

                                      Any block that is not part of a mask is tagged as pollution.

                                      + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                      PARAMETERDESCRIPTION
                                      pipeline +

                                      The pipeline object

                                      +

                                      + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                      +
                                      name + +

                                      + + TYPE: + str + + + DEFAULT: + 'multi-mask-classifier' + +

                                      +
                                      threshold +

                                      The threshold for the alignment

                                      +

                                      + + TYPE: + float + + + DEFAULT: + 1.0 + +

                                      +
                                      masks +

                                      The masks

                                      +

                                      + + TYPE: + Box + + + DEFAULT: + {} + +

                                      +
                                      +

                                      Examples

                                      +
                                      +
                                      +
                                      +
                                      pipeline.add_pipe(
                                      +    "multi-mask-classifier",
                                      +    name="classifier",
                                      +    config={
                                      +        "threshold": 0.9,
                                      +        "mymask": {"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.9, "label": "body"},
                                      +    },
                                      +)
                                      +
                                      +
                                      +
                                      +
                                      [components.classifier]
                                      +@factory = "multi-mask-classifier"
                                      +threshold = 0.9
                                      +
                                      +[components.classifier.mymask]
                                      +label = "body"
                                      +x0 = 0.1
                                      +y0 = 0.1
                                      +x1 = 0.9
                                      +y1 = 0.9
                                      +
                                      +
                                      +
                                      +
                                      +

                                      The following configuration defines a header region.

                                      +
                                      +
                                      +
                                      +
                                      pipeline.add_pipe(
                                      +    "multi-mask-classifier",
                                      +    name="classifier",
                                      +    config={
                                      +        "threshold": 0.9,
                                      +        "header": {"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.3, "label": "header"},
                                      +        "body": {"x0": 0.1, "y0": 0.3, "x1": 0.9, "y1": 0.9, "label": "body"},
                                      +    },
                                      +)
                                      +
                                      +
                                      +
                                      +
                                      [components.classifier]
                                      +@factory = "multi-mask-classifier"
                                      +threshold = 0.9
                                      +
                                      +[components.classifier.header]
                                      +label = "header"
                                      +x0 = 0.1
                                      +y0 = 0.1
                                      +x1 = 0.9
                                      +y1 = 0.3
                                      +
                                      +[components.classifier.body]
                                      +label = "body"
                                      +x0 = 0.1
                                      +y0 = 0.3
                                      +x1 = 0.9
                                      +y1 = 0.9
                                      +
                                      +
                                      +
                                      +
                                      + +
                                      + +
                                      +

                                        + + + + + + +
                                        +
                                        + + +
                                        + +
                                        + + + +
                                        +
                                        +
                                        +
                                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/box-classifiers/random/index.html b/main/pipes/box-classifiers/random/index.html new file mode 100644 index 00000000..242effc0 --- /dev/null +++ b/main/pipes/box-classifiers/random/index.html @@ -0,0 +1,2471 @@ + + + + + + + + + + + + + + + + + + + + + + Random classifier - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                        + +
                                        + + + + + + + + +
                                        + + +
                                        + +
                                        + + + + + + +
                                        +
                                        + + + +
                                        +
                                        +
                                        + + + + +
                                        +
                                        +
                                        + + + +
                                        +
                                        +
                                        + + + +
                                        +
                                        +
                                        + + + +
                                        +
                                        + + + + + + + +

                                        Random classifier

                                        +
                                        + + + + +
                                        + + +

                                        Random classifier, for chaos purposes. Classifies each box to a random element.

                                        + +

                                        Parameters

                                        + + + + + + + + + + + + + + + + + + + + + +
                                        PARAMETERDESCRIPTION
                                        pipeline +

                                        The pipeline object.

                                        +

                                        + + TYPE: + Pipeline + +

                                        +
                                        name +

                                        The name of the component.

                                        +

                                        + + TYPE: + str + + + DEFAULT: + 'random-classifier' + +

                                        +
                                        labels +

                                        The labels to assign to each line. If a list is passed, each label is assigned +with equal probability. If a dict is passed, the keys are the labels and the +values are the probabilities.

                                        +

                                        + + TYPE: + Union[List[str], Dict[str, float]] + +

                                        +
                                        + + + + + +
                                        + + + + + + + + + + + +
                                        + +
                                        + +
                                        +

                                          + + + + + + +
                                          +
                                          + + +
                                          + +
                                          + + + +
                                          +
                                          +
                                          +
                                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/box-classifiers/trainable/index.html b/main/pipes/box-classifiers/trainable/index.html new file mode 100644 index 00000000..d11b7b9f --- /dev/null +++ b/main/pipes/box-classifiers/trainable/index.html @@ -0,0 +1,2527 @@ + + + + + + + + + + + + + + + + + + + + + + Trainable classifier - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                          + +
                                          + + + + + + + + +
                                          + + +
                                          + +
                                          + + + + + + +
                                          +
                                          + + + +
                                          +
                                          +
                                          + + + + +
                                          +
                                          +
                                          + + + +
                                          +
                                          +
                                          + + + +
                                          +
                                          +
                                          + + + +
                                          +
                                          + + + + + + + +

                                          Trainable classifier

                                          +
                                          + + + + +
                                          + + +

                                          This component predicts a label for each box over the whole document using machine +learning.

                                          +
                                          +

                                          Note

                                          +

                                          You must train your model to use this classifier. +See Model training for more information.

                                          +
                                          +

                                          Examples

                                          +

                                          The classifier is composed of the following blocks:

                                          +
                                            +
                                          • a configurable box embedding layer
                                          • +
                                          • a linear classification layer
                                          • +
                                          +

                                          In this example, we use a box-embedding layer to generate the embeddings +of the boxes. It is composed of a text encoder that embeds the text features of the +boxes and a layout encoder that embeds the layout features of the boxes. +These two embeddings are summed and passed through an optional contextualizer, +here a box-transformer.

                                          +
                                          +
                                          +
                                          +
                                          pipeline.add_pipe(
                                          +    "trainable-classifier",
                                          +    name="classifier",
                                          +    config={
                                          +        # simple embedding computed by pooling embeddings of words in each box
                                          +        "embedding": {
                                          +            "@factory": "sub-box-cnn-pooler",
                                          +            "out_channels": 64,
                                          +            "kernel_sizes": (3, 4, 5),
                                          +            "embedding": {
                                          +                "@factory": "simple-text-embedding",
                                          +                "size": 72,
                                          +            },
                                          +        },
                                          +        "labels": ["body", "pollution"],
                                          +    },
                                          +)
                                          +
                                          +
                                          +
                                          +
                                          [components.classifier]
                                          +@factory = "trainable-classifier"
                                          +labels = ["body", "pollution"]
                                          +
                                          +[components.classifier.embedding]
                                          +@factory = "sub-box-cnn-pooler"
                                          +out_channels = 64
                                          +kernel_sizes = (3, 4, 5)
                                          +
                                          +[components.classifier.embedding.embedding]
                                          +@factory = "simple-text-embedding"
                                          +size = 72
                                          +
                                          +
                                          +
                                          +
                                          + +

                                          Parameters

                                          + + + + + + + + + + + + + + + + + +
                                          PARAMETERDESCRIPTION
                                          labels +

                                          Initial labels of the classifier (will be completed during initialization)

                                          +

                                          + + TYPE: + Sequence[str] + + + DEFAULT: + ('pollution') + +

                                          +
                                          embedding +

                                          Embedding module to encode the PDF boxes

                                          +

                                          + + TYPE: + TrainablePipe[EmbeddingOutput] + +

                                          +
                                          + + + + + +
                                          + + + + + + + + + + + +
                                          + +
                                          + +
                                          +

                                            + + + + + + +
                                            +
                                            + + +
                                            + +
                                            + + + +
                                            +
                                            +
                                            +
                                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/embeddings/box-layout-embedding/index.html b/main/pipes/embeddings/box-layout-embedding/index.html new file mode 100644 index 00000000..7277e14c --- /dev/null +++ b/main/pipes/embeddings/box-layout-embedding/index.html @@ -0,0 +1,2525 @@ + + + + + + + + + + + + + + + + + + + + + + BoxLayoutEmbedding - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                            + +
                                            + + + + + + + + +
                                            + + +
                                            + +
                                            + + + + + + +
                                            +
                                            + + + +
                                            +
                                            +
                                            + + + + +
                                            +
                                            +
                                            + + + +
                                            +
                                            +
                                            + + + +
                                            +
                                            +
                                            + + + +
                                            +
                                            + + + + + + + +

                                            BoxLayoutEmbedding

                                            +
                                            + + + + +
                                            + + +

                                            This component encodes the geometrical features of a box, as extracted by the +BoxLayoutPreprocessor module, into an embedding. For position modes, use:

                                            +
                                              +
                                            • "sin" to embed positions with a fixed + SinusoidalEmbedding
                                            • +
                                            • "learned" to embed positions using a learned standard pytorch embedding layer
                                            • +
                                            +

                                            Each produced embedding is the concatenation of the box width, height and the top, +left, bottom and right coordinates, each embedded depending on the *_mode param.

                                            + +

                                            Parameters

                                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                            PARAMETERDESCRIPTION
                                            size +

                                            Size of the output box embedding

                                            +

                                            + + TYPE: + int + +

                                            +
                                            n_positions +

                                            Number of position embeddings stored in the PositionEmbedding module

                                            +

                                            + + TYPE: + int + +

                                            +
                                            x_mode +

                                            Position embedding mode of the x coordinates

                                            +

                                            + + TYPE: + Literal['sin', 'learned'] + + + DEFAULT: + 'sin' + +

                                            +
                                            y_mode +

                                            Position embedding mode of the y coordinates

                                            +

                                            + + TYPE: + Literal['sin', 'learned'] + + + DEFAULT: + 'sin' + +

                                            +
                                            w_mode +

                                            Position embedding mode of the width features

                                            +

                                            + + TYPE: + Literal['sin', 'learned'] + + + DEFAULT: + 'sin' + +

                                            +
                                            h_mode +

                                            Position embedding mode of the height features

                                            +

                                            + + TYPE: + Literal['sin', 'learned'] + + + DEFAULT: + 'sin' + +

                                            +
                                            + + + + + +
                                            + + + + + + + + + + + +
                                            + +
                                            + +
                                            +

                                              + + + + + + +
                                              +
                                              + + +
                                              + +
                                              + + + +
                                              +
                                              +
                                              +
                                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/embeddings/box-transformer/index.html b/main/pipes/embeddings/box-transformer/index.html new file mode 100644 index 00000000..83f2354d --- /dev/null +++ b/main/pipes/embeddings/box-transformer/index.html @@ -0,0 +1,2605 @@ + + + + + + + + + + + + + + + + + + + + + + BoxTransformer - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                              + +
                                              + + + + + + + + +
                                              + + +
                                              + +
                                              + + + + + + +
                                              +
                                              + + + +
                                              +
                                              +
                                              + + + + +
                                              +
                                              +
                                              + + + +
                                              +
                                              +
                                              + + + +
                                              +
                                              +
                                              + + + +
                                              +
                                              + + + + + + + +

                                              BoxTransformer

                                              +
                                              + + + + +
                                              + + +

                                              BoxTransformer using +BoxTransformerModule +under the hood.

                                              +
                                              +

                                              Note

                                              +

                                              This module is a TrainablePipe +and can be used in a Pipeline, while +BoxTransformerModule +is a standard PyTorch module, which does not take care of the +preprocessing, collating, etc. of the input documents.

                                              +
                                              + +

                                              Parameters

                                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                              PARAMETERDESCRIPTION
                                              pipeline +

                                              Pipeline instance

                                              +

                                              + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                              +
                                              name +

                                              Name of the component

                                              +

                                              + + TYPE: + str + + + DEFAULT: + 'box-transformer' + +

                                              +
                                              num_heads +

                                              Number of attention heads in the attention layers

                                              +

                                              + + TYPE: + int + + + DEFAULT: + 2 + +

                                              +
                                              n_relative_positions +

                                              Maximum range of embeddable relative positions between boxes (further +distances are capped to ±n_relative_positions // 2)

                                              +

                                              + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                              +
                                              dropout_p +

                                              Dropout probability both for the attention layers and embedding projections

                                              +

                                              + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                              +
                                              head_size +

                                              Head sizes of the attention layers

                                              +

                                              + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                              +
                                              activation +

                                              Activation function used in the linear->activation->linear transformations

                                              +

                                              + + TYPE: + ActivationFunction + + + DEFAULT: + 'gelu' + +

                                              +
                                              init_resweight +

                                              Initial weight of the residual gates. +At 0, the layer acts (initially) as an identity function, and at 1 as +a standard Transformer layer. +Initializing with a value close to 0 can help the training converge.

                                              +

                                              + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                              +
                                              attention_mode +

                                              Mode of relative position infused attention layer. +See the relative attention +documentation for more information.

                                              +

                                              + + TYPE: + Sequence[Literal['c2c', 'c2p', 'p2c']] + + + DEFAULT: + ('c2c', 'c2p', 'p2c') + +

                                              +
                                              n_layers +

                                              Number of layers in the Transformer

                                              +

                                              + + TYPE: + int + + + DEFAULT: + 2 + +

                                              +
                                              + + + + + +
                                              + + + + + + + + + + + +
                                              + +
                                              + +
                                              +

                                                + + + + + + +
                                                +
                                                + + +
                                                + +
                                                + + + +
                                                +
                                                +
                                                +
                                                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/embeddings/embedding-combiner/index.html b/main/pipes/embeddings/embedding-combiner/index.html new file mode 100644 index 00000000..0bdcc2f4 --- /dev/null +++ b/main/pipes/embeddings/embedding-combiner/index.html @@ -0,0 +1,2514 @@ + + + + + + + + + + + + + + + + + + + + + + EmbeddingCombiner - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                + +
                                                + + + + + + + + +
                                                + + +
                                                + +
                                                + + + + + + +
                                                +
                                                + + + +
                                                +
                                                +
                                                + + + + +
                                                +
                                                +
                                                + + + +
                                                +
                                                +
                                                + + + +
                                                +
                                                +
                                                + + + +
                                                +
                                                + + + + + + + +

                                                EmbeddingCombiner

                                                +
                                                + + + + +
                                                + + + +

                                                Encodes boxes using a combination of multiple encoders

                                                + +

                                                Parameters

                                                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                PARAMETERDESCRIPTION
                                                pipeline +

                                                The pipeline object

                                                +

                                                + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                +
                                                name +

                                                The name of the pipe

                                                +

                                                + + TYPE: + str + + + DEFAULT: + 'embedding-combiner' + +

                                                +
                                                mode +

                                                The mode to use to combine the encoders:

                                                +
                                                  +
                                                • sum: Sum the outputs of the encoders
                                                • +
                                                • cat: Concatenate the outputs of the encoders
                                                • +
                                                +

                                                + + TYPE: + Literal['sum', 'cat'] + + + DEFAULT: + 'sum' + +

                                                +
                                                dropout_p +

                                                Dropout probability used on the output of the box and textual encoders

                                                +

                                                + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                                +
                                                encoders +

                                                The encoders to use. The keys are the names of the encoders and the values +are the encoders themselves.

                                                +

                                                + + TYPE: + TrainablePipe[EmbeddingOutput] + + + DEFAULT: + {} + +

                                                +
                                                + + + + +
                                                + + + + + + + + + + + +
                                                + +
                                                + +
                                                +

                                                  + + + + + + +
                                                  +
                                                  + + +
                                                  + +
                                                  + + + +
                                                  +
                                                  +
                                                  +
                                                  + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/embeddings/huggingface-embedding/index.html b/main/pipes/embeddings/huggingface-embedding/index.html new file mode 100644 index 00000000..60f51979 --- /dev/null +++ b/main/pipes/embeddings/huggingface-embedding/index.html @@ -0,0 +1,2640 @@ + + + + + + + + + + + + + + + + + + + + + + HuggingfaceEmbedding - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                  + +
                                                  + + + + + + + + +
                                                  + + +
                                                  + +
                                                  + + + + + + +
                                                  +
                                                  + + + +
                                                  +
                                                  +
                                                  + + + + +
                                                  +
                                                  +
                                                  + + + +
                                                  +
                                                  +
                                                  + + + +
                                                  +
                                                  +
                                                  + + + +
                                                  +
                                                  + + + + + + + +

                                                  HuggingfaceEmbedding

                                                  +
                                                  + + + + +
                                                  + + +

                                                  The HuggingfaceEmbedding component is a wrapper around the Huggingface multi-modal +models. Such pre-trained models should offer better results than a model trained +from scratch. Compared to using the raw Huggingface model, we offer a simple +mechanism to split long documents into strided windows before feeding them to the +model.

                                                  +

                                                  Windowing

                                                  +

                                                  The HuggingfaceEmbedding component splits long documents into smaller windows before +feeding them to the model. This is done to avoid hitting the maximum number of +tokens that can be processed by the model on a single device. The window size and +stride can be configured using the window and stride parameters. The default +values are 510 and 255 respectively, which means that the model will process windows +of 510 tokens, with consecutive windows starting 255 tokens apart. Whenever a token appears in multiple +windows, the embedding of the "most contextualized" occurrence is used, i.e. the +occurrence that is the closest to the center of its window.

                                                  +

                                                  Here is an overview of how this works in a classifier model: +Transformer windowing

                                                  +

                                                  Examples

                                                  +

                                                  Here is an example of how to define a pipeline with the HuggingfaceEmbedding +component:

                                                  +
                                                  from edspdf import Pipeline
                                                  +
                                                  +model = Pipeline()
                                                  +model.add_pipe(
                                                  +    "pdfminer-extractor",
                                                  +    name="extractor",
                                                  +    config={
                                                  +        "render_pages": True,
                                                  +    },
                                                  +)
                                                  +model.add_pipe(
                                                  +    "huggingface-embedding",
                                                  +    name="embedding",
                                                  +    config={
                                                  +        "model": "microsoft/layoutlmv3-base",
                                                  +        "use_image": False,
                                                  +        "window": 128,
                                                  +        "stride": 64,
                                                  +        "line_pooling": "mean",
                                                  +    },
                                                  +)
                                                  +model.add_pipe(
                                                  +    "trainable-classifier",
                                                  +    name="classifier",
                                                  +    config={
                                                  +        "embedding": model.get_pipe("embedding"),
                                                  +        "labels": [],
                                                  +    },
                                                  +)
                                                  +
                                                  +

                                                  This model can then be trained following the +training recipe.

                                                  + +

                                                  Parameters

                                                  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                  PARAMETERDESCRIPTION
                                                  pipeline +

                                                  The pipeline instance

                                                  +

                                                  + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                  +
                                                  name +

                                                  The component name

                                                  +

                                                  + + TYPE: + str + + + DEFAULT: + 'huggingface-embedding' + +

                                                  +
                                                  model +

                                                  The Huggingface model name or path

                                                  +

                                                  + + TYPE: + str + + + DEFAULT: + None + +

                                                  +
                                                  use_image +

                                                  Whether to use the image or not in the model

                                                  +

                                                  + + TYPE: + bool + + + DEFAULT: + True + +

                                                  +
                                                  window +

                                                  The window size to use when splitting long documents into smaller windows +before feeding them to the Transformer model (default: 510 = 512 - 2)

                                                  +

                                                  + + TYPE: + int + + + DEFAULT: + 510 + +

                                                  +
                                                  stride +

                                                  The stride (distance between windows) to use when splitting long documents into +smaller windows (default: 510 / 2 = 255)

                                                  +

                                                  + + TYPE: + int + + + DEFAULT: + 255 + +

                                                  +
                                                  line_pooling +

                                                  The pooling strategy to use when combining the embeddings of the tokens in a +line into a single line embedding

                                                  +

                                                  + + TYPE: + Literal['mean', 'max', 'sum'] + + + DEFAULT: + 'mean' + +

                                                  +
                                                  max_tokens_per_device +

                                                  The maximum number of tokens that can be processed by the model on a single +device. This does not affect the results but can be used to reduce the memory +usage of the model, at the cost of a longer processing time.

                                                  +

                                                  + + TYPE: + int + + + DEFAULT: + 128 * 128 + +

                                                  +
                                                  + + + + + +
                                                  + + + + + + + + + + + +
                                                  + +
                                                  + +
                                                  +

                                                    + + + + + + +
                                                    +
                                                    + + +
                                                    + +
                                                    + + + +
                                                    +
                                                    +
                                                    +
                                                    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/embeddings/index.html b/main/pipes/embeddings/index.html new file mode 100644 index 00000000..e4550c24 --- /dev/null +++ b/main/pipes/embeddings/index.html @@ -0,0 +1,2381 @@ + + + + + + + + + + + + + + + + + + + + + + Embeddings - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                    + +
                                                    + + + + + + + + +
                                                    + + +
                                                    + +
                                                    + + + + + + +
                                                    +
                                                    + + + +
                                                    +
                                                    +
                                                    + + + + +
                                                    +
                                                    +
                                                    + + + +
                                                    +
                                                    +
                                                    + + + +
                                                    +
                                                    +
                                                    + + + +
                                                    +
                                                    + + + + + + + +

                                                    Embeddings

                                                    +

                                                    We offer multiple embedding methods to encode the text and layout information of the PDFs. The following components can be added to a pipeline or composed together, and contain preprocessing and postprocessing logic to convert and batch documents.

                                                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                    Factory nameDescription
                                                    simple-text-embeddingA module that embeds the textual features of the blocks.
                                                    embedding-combinerEncodes boxes using a combination of multiple encoders
                                                    sub-box-cnn-poolerPools the output of a CNN over the elements of a box (like words)
                                                    box-layout-embeddingEncodes the layout of the boxes
                                                    box-transformerContextualizes box representations using a transformer
                                                    huggingface-embeddingBox representations using a Huggingface multi-modal model.
                                                    + + +
                                                    +

                                                    Layers

                                                    +

                                                    These components are not to be confused with layers, which are standard +PyTorch modules that can be used to build trainable components, such as the ones +described here.

                                                    +
                                                    +

                                                      + + + + + + +
                                                      +
                                                      + + +
                                                      + +
                                                      + + + +
                                                      +
                                                      +
                                                      +
                                                      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/embeddings/simple-text-embedding/index.html b/main/pipes/embeddings/simple-text-embedding/index.html new file mode 100644 index 00000000..c512dfd9 --- /dev/null +++ b/main/pipes/embeddings/simple-text-embedding/index.html @@ -0,0 +1,2474 @@ + + + + + + + + + + + + + + + + + + + + + + SimpleTextEmbedding - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                      + +
                                                      + + + + + + + + +
                                                      + + +
                                                      + +
                                                      + + + + + + +
                                                      +
                                                      + + + +
                                                      +
                                                      +
                                                      + + + + +
                                                      +
                                                      +
                                                      + + + +
                                                      +
                                                      +
                                                      + + + +
                                                      +
                                                      +
                                                      + + + +
                                                      +
                                                      + + + + + + + +

                                                      SimpleTextEmbedding

                                                      +
                                                      + + + + +
                                                      + + +

                                                      A module that embeds the textual features of the blocks

                                                      + + + +

                                                      Parameters

                                                      + + + + + + + + + + + + + + + + + + + + + +
                                                      PARAMETERDESCRIPTION
                                                      size +

                                                      Size of the output box embedding

                                                      +

                                                      + + TYPE: + int + +

                                                      +
                                                      pipeline +

                                                      The pipeline object

                                                      +

                                                      + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                      +
                                                      name +

                                                      Name of the component

                                                      +

                                                      + + TYPE: + str + + + DEFAULT: + 'simple-text-embedding' + +

                                                      +
                                                      + + + + +
                                                      + + + + + + + + + + + +
                                                      + +
                                                      + +
                                                      +

                                                        + + + + + + +
                                                        +
                                                        + + +
                                                        + +
                                                        + + + +
                                                        +
                                                        +
                                                        +
                                                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/embeddings/sub-box-cnn-pooler/index.html b/main/pipes/embeddings/sub-box-cnn-pooler/index.html new file mode 100644 index 00000000..04055965 --- /dev/null +++ b/main/pipes/embeddings/sub-box-cnn-pooler/index.html @@ -0,0 +1,2530 @@ + + + + + + + + + + + + + + + + + + + + + + SubBoxCNNPooler - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                        + +
                                                        + + + + + + + + +
                                                        + + +
                                                        + +
                                                        + + + + + + +
                                                        +
                                                        + + + +
                                                        +
                                                        +
                                                        + + + + +
                                                        +
                                                        +
                                                        + + + +
                                                        +
                                                        +
                                                        + + + +
                                                        +
                                                        +
                                                        + + + +
                                                        +
                                                        + + + + + + + +

                                                        SubBoxCNNPooler

                                                        +
                                                        + + + + +
                                                        + + +

                                                        One-dimensional CNN encoding multi-kernel layer. +Input embeddings are convolved using linear kernels, each parametrized with +a (window) size of kernel_sizes[kernel_i]. +The outputs of the kernels are concatenated together, max-pooled and finally +projected to a size of output_size.

                                                        + +

                                                        Parameters

                                                        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                        PARAMETERDESCRIPTION
                                                        pipeline +

                                                        Pipeline instance

                                                        +

                                                        + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                        +
                                                        name +

                                                        Name of the component

                                                        +

                                                        + + TYPE: + str + + + DEFAULT: + 'sub-box-cnn-pooler' + +

                                                        +
                                                        output_size +

                                                        Size of the output embeddings. +Defaults to the input_size.

                                                        +

                                                        + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                        +
                                                        out_channels +

                                                        Number of channels

                                                        +

                                                        + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                        +
                                                        kernel_sizes +

                                                        Window size of each kernel

                                                        +

                                                        + + TYPE: + Sequence[int] + + + DEFAULT: + (3, 4, 5) + +

                                                        +
                                                        activation +

                                                        Activation function to use

                                                        +

                                                        + + TYPE: + ActivationFunction + + + DEFAULT: + 'relu' + +

                                                        +
                                                        + + + + + +
                                                        + + + + + + + + + + + +
                                                        + +
                                                        + +
                                                        +

                                                          + + + + + + +
                                                          +
                                                          + + +
                                                          + +
                                                          + + + +
                                                          +
                                                          +
                                                          +
                                                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/extractors/index.html b/main/pipes/extractors/index.html new file mode 100644 index 00000000..eddd128e --- /dev/null +++ b/main/pipes/extractors/index.html @@ -0,0 +1,2393 @@ + + + + + + + + + + + + + + + + + + + + + + Extraction - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                          + +
                                                          + + + + + + + + +
                                                          + + +
                                                          + +
                                                          + + + + + + +
                                                          +
                                                          + + + +
                                                          +
                                                          +
                                                          + + + + +
                                                          +
                                                          +
                                                          + + + +
                                                          +
                                                          +
                                                          + + + +
                                                          +
                                                          +
                                                          + + + +
                                                          +
                                                          + + + + + + + +

                                                          Extraction

                                                          +

                                                          The extraction phase consists of reading the PDF document and gathering text blocks, along with their dimensions and position within the document. Said blocks will go on to the classification phase to separate the body from the rest.

                                                          +

                                                          Text-based PDF

                                                          +

                                                          We provide multiple extractor architectures for text-based PDFs:

                                                          + + + + + + + + + + + + + + + + + + + + + + + +
                                                          Factory nameDescription
                                                          pdfminer-extractorExtracts text lines with the pdfminer library
                                                          mupdf-extractorExtracts text lines with the pymupdf library
                                                          poppler-extractorExtracts text lines with the poppler library
                                                          + + +

                                                          Image-based PDF

                                                          +

                                                          Image-based PDF documents require an OCR1 step, which is not natively supported by EDS-PDF. +However, you can easily extend EDS-PDF by adding such a method to the registry.

                                                          +

                                                          We plan on adding such an OCR extractor component in the future.

                                                          +
                                                          +
                                                          +
                                                            +
                                                          1. +

                                                            Optical Character Recognition, or OCR, is the process of extracting characters and words from an image. 

                                                            +
                                                          2. +
                                                          +
                                                          +

                                                            + + + + + + +
                                                            +
                                                            + + +
                                                            + +
                                                            + + + +
                                                            +
                                                            +
                                                            +
                                                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/extractors/pdfminer/index.html b/main/pipes/extractors/pdfminer/index.html new file mode 100644 index 00000000..995abd07 --- /dev/null +++ b/main/pipes/extractors/pdfminer/index.html @@ -0,0 +1,2653 @@ + + + + + + + + + + + + + + + + + + + + + + PdfMiner Extractor - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                            + +
                                                            + + + + + + + + +
                                                            + + +
                                                            + +
                                                            + + + + + + +
                                                            +
                                                            + + + +
                                                            +
                                                            +
                                                            + + + + +
                                                            +
                                                            +
                                                            + + + +
                                                            +
                                                            +
                                                            + + + +
                                                            +
                                                            +
                                                            + + + +
                                                            +
                                                            + + + + + + + +

                                                            PdfMiner Extractor

                                                            +
                                                            + + + + +
                                                            + + +

                                                            We provide a PDF line extractor built on top of +PdfMiner.

                                                            +

                                                            This is the most portable extractor, since it is pure-python and can therefore +be run on any platform. Be sure to have a look at their documentation, +especially the part providing a bird's eye view of the PDF extraction process.

                                                            +

                                                            Examples

                                                            +
                                                            +
                                                            +
                                                            +
                                                            pipeline.add_pipe(
                                                            +    "pdfminer-extractor",
                                                            +    config=dict(
                                                            +        extract_style=False,
                                                            +    ),
                                                            +)
                                                            +
                                                            +
                                                            +
                                                            +
                                                            [components.extractor]
                                                            +@factory = "pdfminer-extractor"
                                                            +extract_style = false
                                                            +
                                                            +
                                                            +
                                                            +
                                                            +

                                                            And use the pipeline on a PDF document:

                                                            +
                                                            from pathlib import Path
                                                            +
                                                            +# Apply on a new document
                                                            +pipeline(Path("path/to/your/pdf/document").read_bytes())
                                                            +
                                                            + +

                                                            Parameters

                                                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                            PARAMETERDESCRIPTION
                                                            line_overlap +

                                                            See PDFMiner documentation

                                                            +

                                                            + + TYPE: + float + + + DEFAULT: + 0.5 + +

                                                            +
                                                            char_margin +

                                                            See PDFMiner documentation

                                                            +

                                                            + + TYPE: + float + + + DEFAULT: + 2.05 + +

                                                            +
                                                            line_margin +

                                                            See PDFMiner documentation

                                                            +

                                                            + + TYPE: + float + + + DEFAULT: + 0.5 + +

                                                            +
                                                            word_margin +

                                                            See PDFMiner documentation

                                                            +

                                                            + + TYPE: + float + + + DEFAULT: + 0.1 + +

                                                            +
                                                            boxes_flow +

                                                            See PDFMiner documentation

                                                            +

                                                            + + TYPE: + Optional[float] + + + DEFAULT: + 0.5 + +

                                                            +
                                                            detect_vertical +

                                                            See PDFMiner documentation

                                                            +

                                                            + + TYPE: + bool + + + DEFAULT: + False + +

                                                            +
                                                            all_texts +

                                                            See PDFMiner documentation

                                                            +

                                                            + + TYPE: + bool + + + DEFAULT: + False + +

                                                            +
                                                            extract_style +

                                                            Whether to extract style (font, size, ...) information for each line of +the document. +Default: False

                                                            +

                                                            + + TYPE: + bool + + + DEFAULT: + False + +

                                                            +
                                                            render_pages +

                                                            Whether to extract the rendered page as a numpy array in the page.image +attribute (defaults to False)

                                                            +

                                                            + + TYPE: + bool + + + DEFAULT: + False + +

                                                            +
                                                            render_dpi +

                                                            DPI to use when rendering the page (defaults to 200)

                                                            +

                                                            + + TYPE: + int + + + DEFAULT: + 200 + +

                                                            +
                                                            raise_on_error +

                                                            Whether to raise an error if the PDF cannot be parsed. +Default: False

                                                            +

                                                            + + TYPE: + bool + + + DEFAULT: + False + +

                                                            +
                                                            + + + + + +
                                                            + + + + + + + + + + + +
                                                            + +
                                                            + +
                                                            +

                                                              + + + + + + +
                                                              +
                                                              + + +
                                                              + +
                                                              + + + +
                                                              +
                                                              +
                                                              +
                                                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/pipes/index.html b/main/pipes/index.html new file mode 100644 index 00000000..8c7fa57a --- /dev/null +++ b/main/pipes/index.html @@ -0,0 +1,2452 @@ + + + + + + + + + + + + + + + + + + + + + + Components overview - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                              + +
                                                              + + + + + + + + +
                                                              + + +
                                                              + +
                                                              + + + + + + +
                                                              +
                                                              + + + +
                                                              +
                                                              +
                                                              + + + + +
                                                              +
                                                              +
                                                              + + + +
                                                              +
                                                              +
                                                              + + + +
                                                              +
                                                              +
                                                              + + + +
                                                              +
                                                              + + + + + + + +

                                                              Components overview

                                                              +

                                                              EDS-PDF provides easy-to-use components for defining PDF processing pipelines.

                                                              +
                                                              +
                                                              +
                                                              + + + + + + + + + + + + + + + + + + + + + +
                                                              Factory nameDescription
                                                              pdfminer-extractorExtracts text lines with the pdfminer library
                                                              mupdf-extractorExtracts text lines with the pymupdf library
                                                              poppler-extractorExtracts text lines with the poppler library
                                                              +
                                                              +
                                                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                              Factory nameDescription
                                                              mask-classifierSimple rule-based classification
                                                              multi-mask-classifierSimple rule-based classification
                                                              dummy-classifierDummy classifier, for testing purposes.
                                                              random-classifierTo sow chaos
                                                              trainable-classifierTrainable box classification model
                                                              +
                                                              +
                                                              + + + + + + + + + + + + + +
                                                              Factory nameDescription
                                                              simple-aggregatorReturns a dictionary with one key for each detected class
                                                              +
                                                              +
                                                              +

                                                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                              Factory nameDescription
                                                              simple-text-embeddingA module that embeds the textual features of the blocks.
                                                              embedding-combinerEncodes boxes using a combination of multiple encoders
                                                              sub-box-cnn-poolerPools the output of a CNN over the elements of a box (like words)
                                                              box-layout-embeddingEncodes the layout of the boxes
                                                              box-transformerContextualizes box representations using a transformer
                                                              huggingface-embeddingBox representations using a Huggingface multi-modal model.
                                                              +
                                                              +
                                                              +
                                                              +

                                                              You can add them to your EDS-PDF pipeline by simply calling add_pipe, for instance:

                                                              + + +
                                                              # ↑ Omitted code that defines the pipeline object ↑
                                                              +pipeline.add_pipe("pdfminer-extractor", name="component-name", config=...)
                                                              +
                                                              +

                                                                + + + + + + +
                                                                +
                                                                + + +
                                                                + +
                                                                + + + +
                                                                +
                                                                +
                                                                +
                                                                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/recipes/annotation/index.html b/main/recipes/annotation/index.html new file mode 100644 index 00000000..0a9a787f --- /dev/null +++ b/main/recipes/annotation/index.html @@ -0,0 +1,2581 @@ + + + + + + + + + + + + + + + + + + + + + + PDF Annotation - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                + +
                                                                + + + + + + + + +
                                                                + + +
                                                                + +
                                                                + + + + + + +
                                                                +
                                                                + + + +
                                                                +
                                                                +
                                                                + + + + +
                                                                +
                                                                +
                                                                + + + +
                                                                +
                                                                +
                                                                + + + +
                                                                +
                                                                +
                                                                + + + +
                                                                +
                                                                + + + + + + + +

                                                                PDF Annotation

                                                                +

                                                                In this section, we will cover one methodology to annotate PDF documents.

                                                                +
                                                                +

                                                                Data annotation at AP-HP's CDW

                                                                +

                                                                At AP-HP's CDW1, we recently moved away from a rule- and Java-based PDF extraction pipeline +(using PDFBox) to one using EDS-PDF. Hence, EDS-PDF is used in production, helping +extract text from around 100k PDF documents every day.

                                                                +

                                                                To train our pipeline presently in production, we annotated around 270 documents, and reached +an F1-score of 0.98 on the body classification.

                                                                +
                                                                +

                                                                Preparing the data for annotation

                                                                +

                                                                We will frame the annotation phase as an image segmentation task, +where annotators are asked to draw bounding boxes around the different sections. +Hence, the very first step is to convert PDF documents to images. We suggest using the +library pdf2image for that step.

                                                                +

                                                                The following script will convert the PDF documents located in a data/pdfs directory +to PNG images inside the data/images folder.

                                                                +
                                                                import pdf2image
                                                                +from pathlib import Path
                                                                +
                                                                +DATA_DIR = Path("data")
                                                                +PDF_DIR = DATA_DIR / "pdfs"
                                                                +IMAGE_DIR = DATA_DIR / "images"
                                                                +
                                                                +for pdf in PDF_DIR.glob("*.pdf"):
                                                                +    imgs = pdf2image.convert_from_path(pdf)
                                                                +
                                                                +    for page, img in enumerate(imgs):
                                                                +        path = IMAGE_DIR / f"{pdf.stem}_{page}.png"
                                                                +        img.save(path)
                                                                +
                                                                +

                                                                You can use any annotation tool to annotate the images. If you're looking for a simple +way to annotate from within a Jupyter Notebook, +ipyannotations +might be a good fit.

                                                                +

                                                                You will need to post-process the output +to convert the annotations to the following format:

                                                                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                KeyDescription
                                                                pagePage within the PDF (0-indexed)
                                                                x0Horizontal position of the top-left corner of the bounding box
                                                                x1Horizontal position of the bottom-right corner of the bounding box
                                                                y0Vertical position of the top-left corner of the bounding box
                                                                y1Vertical position of the bottom-right corner of the bounding box
                                                                labelClass of the bounding box (eg body, header...)
                                                                +

                                                                All coordinates should be normalised by the page dimensions: horizontal positions by the page width, and vertical positions by the page height.

                                                                +

                                                                Saving the dataset

                                                                +

                                                                Once the annotation phase is complete, make sure the train/test split is performed +once and for all when you create the dataset.

                                                                +

                                                                We suggest the following structure:

                                                                +
                                                                Directory structure
                                                                dataset/
                                                                +├── train/
                                                                +│   ├── <note_id_1>.pdf
                                                                +│   ├── <note_id_1>.json
                                                                +│   ├── <note_id_2>.pdf
                                                                +│   ├── <note_id_2>.json
                                                                +│   └── ...
                                                                +└── test/
                                                                +    ├── <note_id_n>.pdf
                                                                +    ├── <note_id_n>.json
                                                                +    └── ...
                                                                +
                                                                +

                                                                Where the normalised annotation resides in a JSON file living next to the related PDF, +and uses the following schema:

                                                                + + + + + + + + + + + + + + + + + + + + + +
                                                                KeyDescription
                                                                note_idReference to the document
                                                                <properties>Optional property of the document itself
                                                                annotationsList of annotations, following the schema above
                                                                +

                                                                This structure presents the advantage of being machine- and human-friendly. +The JSON file contains annotated regions as well as any document property that +could be useful to adapt the pipeline (typically for the classification step).

                                                                +

                                                                Extracting annotations

                                                                +

                                                                The following snippet extracts the annotations into a workable format:

                                                                +
                                                                import json
                                                                +from pathlib import Path
                                                                +
                                                                +import pandas as pd
                                                                +from tqdm import tqdm
                                                                +
                                                                +
                                                                +def get_annotations(
                                                                +    directory: Path,
                                                                +) -> pd.DataFrame:
                                                                +    """
                                                                +    Read annotations from the dataset directory.
                                                                +
                                                                +    Parameters
                                                                +    ----------
                                                                +    directory : Path
                                                                +        Dataset directory
                                                                +
                                                                +    Returns
                                                                +    -------
                                                                +    pd.DataFrame
                                                                +        Pandas DataFrame containing the annotations.
                                                                +    """
                                                                +    dfs = []
                                                                +
                                                                +    iterator = tqdm(list(directory.glob("*.json")))
                                                                +
                                                                +    for path in iterator:
                                                                +        meta = json.loads(path.read_text())
                                                                +        df = pd.DataFrame.from_records(meta.pop("annotations"))
                                                                +
                                                                +        for k, v in meta.items():  # (1)
                                                                +            df[k] = v
                                                                +
                                                                +        dfs.append(df)
                                                                +
                                                                +    return pd.concat(dfs)
                                                                +
                                                                +
                                                                +train_path = Path("dataset/train")
                                                                +
                                                                +annotations = get_annotations(train_path)
                                                                +
                                                                +
                                                                  +
                                                                1. Add a column for each additional property saved in the dataset.
                                                                2. +
                                                                +

                                                                The annotations compiled this way can be used to train a pipeline. +See the trained pipeline recipe for more detail.

                                                                +
                                                                +
                                                                +
                                                                  +
                                                                1. +

                                                                  Greater Paris University Hospital's Clinical Data Warehouse 

                                                                  +
                                                                2. +
                                                                +
                                                                +

                                                                  + + + + + + +
                                                                  +
                                                                  + + +
                                                                  + +
                                                                  + + + +
                                                                  +
                                                                  +
                                                                  +
                                                                  + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/recipes/extension/index.html b/main/recipes/extension/index.html new file mode 100644 index 00000000..e00410e3 --- /dev/null +++ b/main/recipes/extension/index.html @@ -0,0 +1,2502 @@ + + + + + + + + + + + + + + + + + + + + + + Extending EDS-PDF - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                  + +
                                                                  + + + + + + + + +
                                                                  + + +
                                                                  + +
                                                                  + + + + + + +
                                                                  +
                                                                  + + + +
                                                                  +
                                                                  +
                                                                  + + + + +
                                                                  +
                                                                  +
                                                                  + + + +
                                                                  +
                                                                  +
                                                                  + + + +
                                                                  +
                                                                  +
                                                                  + + + +
                                                                  +
                                                                  + + + + + + + +

                                                                  Extending EDS-PDF

                                                                  +

                                                                  EDS-PDF is organised around a function registry powered by catalogue and a custom configuration system. The result is a powerful framework that is easy to extend - and we'll see how in this section.

                                                                  +

                                                                  For this recipe, let's imagine we're not entirely satisfied with the aggregation +proposed by EDS-PDF. For instance, we might want an aggregator that outputs the +text in Markdown format.

                                                                  +
                                                                  +

                                                                  Note

                                                                  +

                                                                  Properly converting to markdown is no easy task. For this example, +we will limit ourselves to detecting bold and italics sections.

                                                                  +
                                                                  +

                                                                  Developing the new aggregator

                                                                  +

                                                                  Our aggregator will inherit from the SimpleAggregator, +and use the style to detect italics and bold sections.

                                                                  +
                                                                  markdown_aggregator.py
                                                                  from edspdf import registry
                                                                  +from edspdf.pipes.aggregators.simple import SimpleAggregator
                                                                  +from edspdf.structures import PDFDoc, Text
                                                                  +
                                                                  +
                                                                  +@registry.factory.register("markdown-aggregator")  # (1)
                                                                  +class MarkdownAggregator(SimpleAggregator):
                                                                  +    def __call__(self, doc: PDFDoc) -> PDFDoc:
                                                                  +        doc = super().__call__(doc)
                                                                  +
                                                                  +        for label in doc.aggregated_texts.keys():
                                                                  +            text = doc.aggregated_texts[label].text
                                                                  +
                                                                  +            fragments = []
                                                                  +
                                                                  +            offset = 0
                                                                  +            for s in doc.aggregated_texts[label].properties:
                                                                  +                if s.begin >= s.end:
                                                                  +                    continue
                                                                  +                if offset < s.begin:
                                                                  +                    fragments.append(text[offset : s.begin])
                                                                  +
                                                                  +                offset = s.end
                                                                  +                snippet = text[s.begin : s.end]
                                                                  +                if s.bold:
                                                                  +                    snippet = f"**{snippet}**"
                                                                  +                if s.italic:
                                                                  +                    snippet = f"_{snippet}_"
                                                                  +                fragments.append(snippet)
                                                                  +
                                                                  +            if offset < len(text):
                                                                  +                fragments.append(text[offset:])
                                                                  +
                                                                  +            doc.aggregated_texts[label] = Text(text="".join(fragments))
                                                                  +
                                                                  +        return doc
                                                                  +
                                                                  +
                                                                    +
                                                                  1. The new aggregator is registered via this line
                                                                  2. +
                                                                  3. The new aggregator redefines the __call__ method. + It will output a single string, corresponding to the markdown-formatted output.
                                                                  4. +
                                                                  +

                                                                  That's it! You can use this new aggregator with the API:

                                                                  +
                                                                  from edspdf import Pipeline
                                                                  +from markdown_aggregator import MarkdownAggregator  # (1)
                                                                  +
                                                                  +model = Pipeline()
                                                                  +# will extract text lines from a document
                                                                  +model.add_pipe(
                                                                  +    "pdfminer-extractor",
                                                                  +    config=dict(
                                                                  +        extract_style=True,
                                                                  +    ),
                                                                  +)
                                                                  +# classify everything inside the `body` bounding box as `body`
                                                                  +model.add_pipe("mask-classifier", config={"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.9})
                                                                  +# aggregates the lines together to generate the markdown formatted text
                                                                  +model.add_pipe("markdown-aggregator")
                                                                  +
                                                                  +
                                                                    +
                                                                  1. We're importing the aggregator that we just defined.
                                                                  2. +
                                                                  +

                                                                  It all works relatively smoothly!

                                                                  +

                                                                  Making the aggregator discoverable

                                                                  +

                                                                  Now, how can we instantiate the pipeline using the configuration system? +The registry needs to be aware of the new function, but we shouldn't have to +import markdown_aggregator.py just so that the module is registered as a side-effect...

                                                                  +

                                                                  Catalogue solves this problem by using Python entry points.

                                                                  +
                                                                  +
                                                                  +
                                                                  +
                                                                  [project.entry-points."edspdf_factories"]
                                                                  +"markdown-aggregator" = "markdown_aggregator:MarkdownAggregator"
                                                                  +
                                                                  +
                                                                  +
                                                                  +
                                                                  from setuptools import setup
                                                                  +
                                                                  +setup(
                                                                  +    name="edspdf-markdown-aggregator",
                                                                  +    entry_points={
                                                                  +        "edspdf_factories": [
                                                                  +            "markdown-aggregator = markdown_aggregator:MarkdownAggregator"
                                                                  +        ]
                                                                  +    },
                                                                  +)
                                                                  +
                                                                  +
                                                                  +
                                                                  +
                                                                  +

                                                                  Once the new aggregator is declared as an entry point, it becomes discoverable by EDS-PDF +as long as the package is installed in your environment!

                                                                  +

                                                                    + + + + + + +
                                                                    +
                                                                    + + +
                                                                    + +
                                                                    + + + +
                                                                    +
                                                                    +
                                                                    +
                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/recipes/index.html b/main/recipes/index.html new file mode 100644 index 00000000..c21f459e --- /dev/null +++ b/main/recipes/index.html @@ -0,0 +1,2330 @@ + + + + + + + + + + + + + + + + + + + + + + EDS-PDF Recipes - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                    + +
                                                                    + + + + + + + + +
                                                                    + + +
                                                                    + +
                                                                    + + + + + + +
                                                                    +
                                                                    + + + +
                                                                    +
                                                                    +
                                                                    + + + + +
                                                                    +
                                                                    +
                                                                    + + + +
                                                                    +
                                                                    +
                                                                    + + + +
                                                                    +
                                                                    +
                                                                    + + + +
                                                                    +
                                                                    + + + + + + + +

                                                                    EDS-PDF Recipes

                                                                    +

                                                                    This section goes over a few use-cases for PDF extraction. +It is meant as a more hands-on tutorial to get a grip on the library.

                                                                    +

                                                                      + + + + + + +
                                                                      +
                                                                      + + +
                                                                      + +
                                                                      + + + +
                                                                      +
                                                                      +
                                                                      +
                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/recipes/resources/deep-learning-architecture.svg b/main/recipes/resources/deep-learning-architecture.svg new file mode 100644 index 00000000..147f7d90 --- /dev/null +++ b/main/recipes/resources/deep-learning-architecture.svg @@ -0,0 +1,3 @@ + + +
                                                                      X0
                                                                      X0
                                                                      Y0
                                                                      Y0
                                                                      X1
                                                                      X1
                                                                      Y1
                                                                      Y1
                                                                      W
                                                                      W
                                                                      H
                                                                      H
                                                                      CNN + MaxPooling
                                                                      CNN + MaxPooling
                                                                      2
                                                                      2
                                                                      ACR
                                                                      ACR
                                                                      classé
                                                                      classé
                                                                      Examen
                                                                      Examen
                                                                      2
                                                                      2
                                                                      acr
                                                                      acr
                                                                      classe
                                                                      classe
                                                                      examen
                                                                      examen
                                                                      d
                                                                      d
                                                                      XXX
                                                                      XXX
                                                                      xxxx
                                                                      xxxx
                                                                      Xxxxx
                                                                      Xxxxx
                                                                      2
                                                                      2
                                                                      acr
                                                                      acr
                                                                      cla
                                                                      cla
                                                                      exa
                                                                      exa
                                                                      2
                                                                      2
                                                                      acr
                                                                      acr
                                                                      sse
                                                                      sse
                                                                      men
                                                                      men
                                                                      Concatenation
                                                                      Concatenation
                                                                      ...
                                                                      ...
                                                                      Line n°i
                                                                      Line n°i
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...

                                                                      +

                                                                      +

                                                                      +

                                                                      +

                                                                      +

                                                                      +

                                                                      +

                                                                      +
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      ...
                                                                      Transformer layers (w/ relative position attention)
                                                                      Transformer layers (w/ relative position attention)
                                                                      Initial embedding
                                                                      Initial embedding
                                                                      Contextualization
                                                                      Contextualization
                                                                      Corps
                                                                      Corps
                                                                      Corps
                                                                      Corps
                                                                      Entete
                                                                      Entete
                                                                      Titre
                                                                      Titre
                                                                      Corps
                                                                      Corps
                                                                      Classification
                                                                      Classification

                                                                      +

                                                                      +
                                                                      Classification layer
                                                                      Classification la...
                                                                      Suffixes
                                                                      Suffixes
                                                                      Prefixes
                                                                      Prefixes
                                                                      Shape
                                                                      Shape
                                                                      Text is not SVG - cannot display
                                                                      diff --git a/main/recipes/resources/lines.jpeg b/main/recipes/resources/lines.jpeg new file mode 100644 index 00000000..b3afb26b Binary files /dev/null and b/main/recipes/resources/lines.jpeg differ diff --git a/main/recipes/resources/merged.jpeg b/main/recipes/resources/merged.jpeg new file mode 100644 index 00000000..c6d767d1 Binary files /dev/null and b/main/recipes/resources/merged.jpeg differ diff --git a/main/recipes/rule-based/index.html b/main/recipes/rule-based/index.html new file mode 100644 index 00000000..199a9b42 --- /dev/null +++ b/main/recipes/rule-based/index.html @@ -0,0 +1,2574 @@ + + + + + + + + + + + + + + + + + + + + + + Rule-based extraction - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                      + +
                                                                      + + + + + + + + +
                                                                      + + +
                                                                      + +
                                                                      + + + + + + +
                                                                      +
                                                                      + + + +
                                                                      +
                                                                      +
                                                                      + + + + +
                                                                      +
                                                                      +
                                                                      + + + +
                                                                      +
                                                                      +
                                                                      + + + +
                                                                      +
                                                                      +
                                                                      + + + +
                                                                      +
                                                                      + + + + + + + +

                                                                      Rule-based extraction

                                                                      +

                                                                      Let's create a rule-based extractor for PDF documents.

                                                                      +
                                                                      +

                                                                      Note

                                                                      +

                                                                      This pipeline will likely perform poorly as soon as your PDF documents +come in varied forms. In that case, even a very simple trained pipeline +may give you a substantial performance boost (see next section).

                                                                      +
                                                                      +

                                                                      First, download this example PDF.

                                                                      +

                                                                      We will use the following configuration:

                                                                      +
                                                                      config.cfg
                                                                      [pipeline]
                                                                      +components = ["extractor", "classifier", "aggregator"]
                                                                      +components_config = ${components}
                                                                      +
                                                                      +[components.extractor]
                                                                      +@factory = "pdfminer-extractor"  # (2)
                                                                      +extract_style = true
                                                                      +
                                                                      +[components.classifier]
                                                                      +@factory = "mask-classifier"  # (3)
                                                                      +x0 = 0.2
                                                                      +x1 = 0.9
                                                                      +y0 = 0.3
                                                                      +y1 = 0.6
                                                                      +threshold = 0.1
                                                                      +
                                                                      +[components.aggregator]
                                                                      +@factory = "styled-aggregator"  # (4)
                                                                      +
                                                                      +
                                                                        +
                                                                      1. This is the top-level object, which organises the entire extraction process.
                                                                      +
                                                                      2. Here we use the provided text-based extractor, based on the PDFMiner library.
                                                                      +
                                                                      3. This is where we define the rule-based classifier. Here, we use a "mask", + meaning that every text block that falls within the boundaries will be assigned + the body label; everything else will be tagged as pollution.
                                                                      +
                                                                      4. This aggregator returns a tuple of dictionaries. The first contains compiled text for each + label, the second exports their style.
                                                                      +
                                                                      +

                                                                      Save the configuration as config.cfg and run the following snippet:

                                                                      +
                                                                      import edspdf
                                                                      +import pandas as pd
                                                                      +from pathlib import Path
                                                                      +
                                                                      +model = edspdf.load("config.cfg")  # (1)
                                                                      +
                                                                      +# Get a PDF
                                                                      +pdf = Path("letter.pdf").read_bytes()
                                                                      +pdf = model(pdf)
                                                                      +
                                                                      +body = pdf.aggregated_texts["body"]
                                                                      +
                                                                      +text, style = body.text, body.properties
                                                                      +print(text)
                                                                      +print(pd.DataFrame(style))
                                                                      +
                                                                      +

                                                                      This code will output the following results:

                                                                      +
                                                                      +
                                                                      +
                                                                      +

                                                                      lines

                                                                      +
                                                                      +
                                                                      +
                                                                      Cher Pr ABC, Cher DEF,
                                                                      +
                                                                      +Nous souhaitons remercier le CSE pour son avis favorable quant à l’accès aux données de
                                                                      +l’Entrepôt de Données de Santé du projet n° XXXX.
                                                                      +
                                                                      +Nous avons bien pris connaissance des conditions requises pour cet avis favorable, c’est
                                                                      +pourquoi nous nous engageons par la présente à :
                                                                      +
                                                                      +• Informer individuellement les patients concernés par la recherche, admis à l'AP-HP
                                                                      +avant juillet 2017, sortis vivants, et non réadmis depuis.
                                                                      +
                                                                      +• Effectuer une demande d'autorisation à la CNIL en cas d'appariement avec d’autres
                                                                      +cohortes.
                                                                      +
                                                                      +Bien cordialement,
                                                                      +
                                                                      +
                                                                      +
                                                                      +

                                                                      The start and end columns refer to the character indices within the extracted text.

                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                      italic  bold   fontname        start  end
                                                                      False   False  BCDFEE+Calibri  0      22
                                                                      False   False  BCDFEE+Calibri  24     90
                                                                      False   False  BCDHEE+Calibri  90     91
                                                                      False   False  BCDFEE+Calibri  91     111
                                                                      False   False  BCDFEE+Calibri  112    113
                                                                      False   False  BCDHEE+Calibri  113    114
                                                                      False   False  BCDFEE+Calibri  114    161
                                                                      False   False  BCDFEE+Calibri  163    247
                                                                      False   False  BCDHEE+Calibri  247    248
                                                                      False   False  BCDFEE+Calibri  248    251
                                                                      False   False  BCDFEE+Calibri  252    300
                                                                      False   False  SymbolMT        302    303
                                                                      False   False  BCDFEE+Calibri  304    386
                                                                      False   False  BCDFEE+Calibri  387    445
                                                                      False   False  SymbolMT        447    448
                                                                      False   False  BCDFEE+Calibri  449    523
                                                                      False   False  BCDHEE+Calibri  523    524
                                                                      False   False  BCDFEE+Calibri  524    530
                                                                      False   False  BCDFEE+Calibri  531    540
                                                                      False   False  BCDFEE+Calibri  542    560
                                                                      +
                                                                      +
                                                                      +
                                                                      +

                                                                        + + + + + + +
                                                                        +
                                                                        + + +
                                                                        + +
                                                                        + + + +
                                                                        +
                                                                        +
                                                                        +
                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/recipes/training/index.html b/main/recipes/training/index.html new file mode 100644 index 00000000..1a49a28d --- /dev/null +++ b/main/recipes/training/index.html @@ -0,0 +1,3207 @@ + + + + + + + + + + + + + + + + + + + + + + Training a Pipeline - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                        + +
                                                                        + + + + + + + + +
                                                                        + + +
                                                                        + +
                                                                        + + + + + + +
                                                                        +
                                                                        + + + +
                                                                        +
                                                                        +
                                                                        + + + + +
                                                                        +
                                                                        +
                                                                        + + + +
                                                                        +
                                                                        +
                                                                        + + + +
                                                                        +
                                                                        +
                                                                        + + + +
                                                                        +
                                                                        + + + + + + + +

                                                                        Training a Pipeline

                                                                        +

                                                                        In this chapter, we'll see how we can train a deep-learning-based classifier to better classify the lines of the +document and extract text from it.

                                                                        +

                                                                        Step-by-step walkthrough

                                                                        +

                                                                        Training supervised models consists in feeding batches of samples taken from a training corpus +to a model instantiated from a given architecture and optimizing the learnable weights of the +model to decrease a given loss. The process of training a pipeline with EDS-PDF is as follows:

                                                                        +
                                                                          +
                                                                        1. +

                                                                          We start by seeding the random states and instantiating a new trainable pipeline. Here we show two example pipelines: the first is based on a custom embedding architecture, and the second on a pre-trained HuggingFace transformer model.

                                                                          +
                                                                          +
                                                                          +
                                                                          +

                                                                          The architecture of the trainable classifier of this recipe is described in the following figure: +Architecture of the trainable classifier

                                                                          +
                                                                          from edspdf import Pipeline
                                                                          +from edspdf.utils.random import set_seed
                                                                          +
                                                                          +set_seed(42)
                                                                          +
                                                                          +model = Pipeline()
                                                                          +model.add_pipe("pdfminer-extractor", name="extractor") # (1)
                                                                          +model.add_pipe(
                                                                          +    "box-transformer",
                                                                          +    name="embedding",
                                                                          +    config={
                                                                          +        "num_heads": 4,
                                                                          +        "dropout_p": 0.1,
                                                                          +        "activation": "gelu",
                                                                          +        "init_resweight": 0.01,
                                                                          +        "head_size": 16,
                                                                          +        "attention_mode": ["c2c", "c2p", "p2c"],
                                                                          +        "n_layers": 1,
                                                                          +        "n_relative_positions": 64,
                                                                          +        "embedding": {
                                                                          +            "@factory": "embedding-combiner",
                                                                          +            "dropout_p": 0.1,
                                                                          +            "text_encoder": {
                                                                          +                "@factory": "sub-box-cnn-pooler",
                                                                          +                "out_channels": 64,
                                                                          +                "kernel_sizes": (3, 4, 5),
                                                                          +                "embedding": {
                                                                          +                    "@factory": "simple-text-embedding",
                                                                          +                    "size": 72,
                                                                          +                },
                                                                          +            },
                                                                          +            "layout_encoder": {
                                                                          +                "@factory": "box-layout-embedding",
                                                                          +                "n_positions": 64,
                                                                          +                "x_mode": "learned",
                                                                          +                "y_mode": "learned",
                                                                          +                "w_mode": "learned",
                                                                          +                "h_mode": "learned",
                                                                          +                "size": 72,
                                                                          +            },
                                                                          +        },
                                                                          +    },
                                                                          +)
                                                                          +model.add_pipe(
                                                                          +    "trainable-classifier",
                                                                          +    name="classifier",
                                                                          +    config={
                                                                          +        "embedding": model.get_pipe("embedding"),
                                                                          +        "labels": [],
                                                                          +    },
                                                                          +)
                                                                          +
                                                                          +
                                                                            +
                                                                          1. You can choose between multiple extractors, such as "pdfminer-extractor", "mupdf-extractor" or "poppler-extractor" (the latter does not support rendering images). See the list of extractors for more details.
                                                                          2. +
                                                                          +
                                                                          +
                                                                          +
                                                                          model = Pipeline()
                                                                          +model.add_pipe(
                                                                          +    "mupdf-extractor",
                                                                          +    name="extractor",
                                                                          +    config={
                                                                          +        "render_pages": True,
                                                                          +    },
                                                                          +) # (1)
                                                                          +model.add_pipe(
                                                                          +    "huggingface-embedding",
                                                                          +    name="embedding",
                                                                          +    config={
                                                                          +        "model": "microsoft/layoutlmv3-base",
                                                                          +        "use_image": False,
                                                                          +        "window": 128,
                                                                          +        "stride": 64,
                                                                          +        "line_pooling": "mean",
                                                                          +    },
                                                                          +)
                                                                          +model.add_pipe(
                                                                          +    "trainable-classifier",
                                                                          +    name="classifier",
                                                                          +    config={
                                                                          +        "embedding": model.get_pipe("embedding"),
                                                                          +        "labels": [],
                                                                          +    },
                                                                          +)
                                                                          +
                                                                          +
                                                                            +
                                                                          1. You can choose between multiple extractors, such as "pdfminer-extractor", "mupdf-extractor" or "poppler-extractor" (the latter does not support rendering images). See the list of extractors for more details.
                                                                          2. +
                                                                          +
                                                                          +
                                                                          +
                                                                          +
                                                                        2. +
                                                                        3. +

                                                                          We then load and adapt (i.e., convert into PDFDoc objects) the training and validation datasets, which are often a combination of JSON and PDF files. The recommended way of doing this is to write a Python generator of PDFDoc objects. +

                                                                          train_docs = list(segmentation_adapter(train_path)(model))
                                                                          +val_docs = list(segmentation_adapter(val_path)(model))
                                                                          +

                                                                          +
                                                                        4. +
                                                                        5. +

                                                                          We initialize the missing or incomplete component attributes (such as vocabularies) with the training dataset: +

                                                                          model.post_init(train_docs)
                                                                          +

                                                                          +
                                                                        6. +
                                                                        7. +

                                                                          The training dataset is then preprocessed into features. The resulting preprocessed dataset is then wrapped into a pytorch DataLoader to be fed to the model during the training loop with the model's own collate method. +

                                                                          preprocessed = list(model.preprocess_many(train_docs, supervision=True))
                                                                          +dataloader = DataLoader(
                                                                          +    preprocessed,
                                                                          +    batch_size=batch_size,
                                                                          +    collate_fn=model.collate,
                                                                          +    shuffle=True,
                                                                          +)
                                                                          +

                                                                          +
                                                                        8. +
                                                                        9. +

                                                                          We instantiate an optimizer and start the training loop +

                                                                          from itertools import chain, repeat
                                                                          +
                                                                          +optimizer = torch.optim.AdamW(
                                                                          +    params=model.parameters(),
                                                                          +    lr=lr,
                                                                          +)
                                                                          +
                                                                          +# We will loop over the dataloader
                                                                          +iterator = chain.from_iterable(repeat(dataloader))
                                                                          +
                                                                          +for step in tqdm(range(max_steps), "Training model", leave=True):
                                                                          +    batch = next(iterator)
                                                                          +    optimizer.zero_grad()
                                                                          +

                                                                          +
                                                                        10. +
                                                                        11. +

                                                                          The trainable components are fed the collated batches from the dataloader with the TrainablePipe.module_forward methods to compute the losses. Since the outputs of shared subcomponents are reused between components, we enable caching by wrapping this step in a cache context. The training loop is otherwise carried out in a similar fashion to a standard pytorch training loop. +

                                                                          with model.cache():
                                                                          +    loss = torch.zeros((), device="cpu")
                                                                          +    for name, component in model.trainable_pipes():
                                                                          +        output = component.module_forward(batch[component.name])
                                                                          +        if "loss" in output:
                                                                          +            loss += output["loss"]
                                                                          +
                                                                          +    loss.backward()
                                                                          +
                                                                          +    optimizer.step()
                                                                          +

                                                                          +
                                                                        12. +
                                                                        13. +

                                                                          Finally, the model is evaluated on the validation dataset at regular intervals and saved at the end of the training. To score the model, we only want to run the "classifier" component and not the extractor, otherwise we would overwrite the annotated text boxes on documents in the val_docs dataset, and have mismatching text boxes between the gold and predicted documents. Although you can use torch.save to save your model, we provide a safer method that avoids the security pitfalls of pickled models. +

                                                                          from edspdf import Pipeline
                                                                          +from sklearn.metrics import classification_report
                                                                          +from copy import deepcopy
                                                                          +
                                                                          +
                                                                          +def score(golds, preds):
                                                                          +    return classification_report(
                                                                          +        [b.label for gold in golds for b in gold.text_boxes if b.text != ""],
                                                                          +        [b.label for pred in preds for b in pred.text_boxes if b.text != ""],
                                                                          +        output_dict=True,
                                                                          +        zero_division=0,
                                                                          +    )
                                                                          +
                                                                          +
                                                                          +...
                                                                          +
                                                                          +if (step % 100) == 0:
                                                                          +    # we only want to run "classifier" component, not overwrite the text boxes
                                                                          +    with model.select_pipes(enable=["classifier"]):
                                                                          +        print(score(val_docs, model.pipe(deepcopy(val_docs))))
                                                                          +
                                                                          +# torch.save(model, "model.pt")
                                                                          +model.save("model")
                                                                          +

                                                                          +
                                                                        14. +
                                                                        +

                                                                        Adapting a dataset

                                                                        +

                                                                        The first step of training a pipeline is to adapt the dataset to the pipeline. This is done by converting the dataset into a list of PDFDoc objects, using an extractor. The following function loads a dataset of .pdf and .json files, where each .json file contains box annotations represented with page, x0, x1, y0, y1 and label.

                                                                        +
                                                                        from edspdf.utils.alignment import align_box_labels
                                                                        +from pathlib import Path
                                                                        +from pydantic import DirectoryPath
                                                                        +from edspdf.registry import registry
                                                                        +from edspdf.structures import Box
                                                                        +import json
                                                                        +
                                                                        +
                                                                        +@registry.adapter.register("my-segmentation-adapter")
                                                                        +def segmentation_adapter(
                                                                        +    path: DirectoryPath,
                                                                        +):
                                                                        +    def adapt_to(model):
                                                                        +        for anns_filepath in sorted(Path(path).glob("*.json")):
                                                                        +            pdf_filepath = str(anns_filepath).replace(".json", ".pdf")
                                                                        +            with open(anns_filepath) as f:
                                                                        +                sample = json.load(f)
                                                                        +            pdf = Path(pdf_filepath).read_bytes()
                                                                        +
                                                                        +            if len(sample["annotations"]) == 0:
                                                                        +                continue
                                                                        +
                                                                        +            doc = model.components.extractor(pdf)
                                                                        +            doc.id = pdf_filepath.split(".")[0].split("/")[-1]
                                                                        +            doc.lines = [
                                                                        +                line
                                                                        +                for page in sorted(set(b.page for b in doc.lines))
                                                                        +                for line in align_box_labels(
                                                                        +                    src_boxes=[
                                                                        +                        Box(
                                                                        +                            page_num=b["page"],
                                                                        +                            x0=b["x0"],
                                                                        +                            x1=b["x1"],
                                                                        +                            y0=b["y0"],
                                                                        +                            y1=b["y1"],
                                                                        +                            label=b["label"],
                                                                        +                        )
                                                                        +                        for b in sample["annotations"]
                                                                        +                        if b["page"] == page
                                                                        +                    ],
                                                                        +                    dst_boxes=doc.lines,
                                                                        +                    pollution_label=None,
                                                                        +                )
                                                                        +                if line.text == "" or line.label is not None
                                                                        +            ]
                                                                        +            yield doc
                                                                        +
                                                                        +    return adapt_to
                                                                        +
                                                                        +

                                                                        Full example

                                                                        +

                                                                        Let's wrap the training code in a function, and make it callable from the command line using confit!

                                                                        +
                                                                        +train.py +
                                                                          1
                                                                        +  2
                                                                        +  3
                                                                        +  4
                                                                        +  5
                                                                        +  6
                                                                        +  7
                                                                        +  8
                                                                        +  9
                                                                        + 10
                                                                        + 11
                                                                        + 12
                                                                        + 13
                                                                        + 14
                                                                        + 15
                                                                        + 16
                                                                        + 17
                                                                        + 18
                                                                        + 19
                                                                        + 20
                                                                        + 21
                                                                        + 22
                                                                        + 23
                                                                        + 24
                                                                        + 25
                                                                        + 26
                                                                        + 27
                                                                        + 28
                                                                        + 29
                                                                        + 30
                                                                        + 31
                                                                        + 32
                                                                        + 33
                                                                        + 34
                                                                        + 35
                                                                        + 36
                                                                        + 37
                                                                        + 38
                                                                        + 39
                                                                        + 40
                                                                        + 41
                                                                        + 42
                                                                        + 43
                                                                        + 44
                                                                        + 45
                                                                        + 46
                                                                        + 47
                                                                        + 48
                                                                        + 49
                                                                        + 50
                                                                        + 51
                                                                        + 52
                                                                        + 53
                                                                        + 54
                                                                        + 55
                                                                        + 56
                                                                        + 57
                                                                        + 58
                                                                        + 59
                                                                        + 60
                                                                        + 61
                                                                        + 62
                                                                        + 63
                                                                        + 64
                                                                        + 65
                                                                        + 66
                                                                        + 67
                                                                        + 68
                                                                        + 69
                                                                        + 70
                                                                        + 71
                                                                        + 72
                                                                        + 73
                                                                        + 74
                                                                        + 75
                                                                        + 76
                                                                        + 77
                                                                        + 78
                                                                        + 79
                                                                        + 80
                                                                        + 81
                                                                        + 82
                                                                        + 83
                                                                        + 84
                                                                        + 85
                                                                        + 86
                                                                        + 87
                                                                        + 88
                                                                        + 89
                                                                        + 90
                                                                        + 91
                                                                        + 92
                                                                        + 93
                                                                        + 94
                                                                        + 95
                                                                        + 96
                                                                        + 97
                                                                        + 98
                                                                        + 99
                                                                        +100
                                                                        +101
                                                                        +102
                                                                        +103
                                                                        +104
                                                                        +105
                                                                        +106
                                                                        +107
                                                                        +108
                                                                        +109
                                                                        +110
                                                                        +111
                                                                        +112
                                                                        +113
                                                                        +114
                                                                        +115
                                                                        +116
                                                                        +117
                                                                        +118
                                                                        +119
                                                                        +120
                                                                        +121
                                                                        +122
                                                                        +123
                                                                        +124
                                                                        +125
                                                                        +126
                                                                        +127
                                                                        +128
                                                                        +129
                                                                        +130
                                                                        +131
                                                                        +132
                                                                        +133
                                                                        +134
                                                                        +135
                                                                        +136
                                                                        +137
                                                                        +138
                                                                        +139
                                                                        +140
                                                                        +141
                                                                        +142
                                                                        +143
                                                                        +144
                                                                        +145
                                                                        +146
                                                                        +147
                                                                        +148
                                                                        +149
                                                                        +150
                                                                        +151
                                                                        +152
                                                                        +153
                                                                        +154
                                                                        +155
                                                                        +156
                                                                        +157
                                                                        +158
                                                                        +159
                                                                        +160
                                                                        +161
                                                                        +162
                                                                        +163
                                                                        +164
                                                                        +165
                                                                        +166
                                                                        +167
                                                                        +168
                                                                        +169
                                                                        +170
                                                                        +171
                                                                        +172
                                                                        +173
                                                                        +174
                                                                        +175
                                                                        +176
                                                                        +177
                                                                        import itertools
                                                                        +import json
                                                                        +from copy import deepcopy
                                                                        +from pathlib import Path
                                                                        +
                                                                        +import torch
                                                                        +from confit import Cli
                                                                        +from pydantic import DirectoryPath
                                                                        +from torch.utils.data import DataLoader
                                                                        +from tqdm import tqdm
                                                                        +
                                                                        +from edspdf import Pipeline, registry
                                                                        +from edspdf.structures import Box
                                                                        +from edspdf.utils.alignment import align_box_labels
                                                                        +from edspdf.utils.random import set_seed
                                                                        +
                                                                        +app = Cli(pretty_exceptions_show_locals=False)  # CLI application: commands are registered on it with @app.command and it is invoked at the bottom of the script
                                                                        +
                                                                        +
                                                                        +def score(golds, preds):
                                                                        +    return classification_report(
                                                                        +        [b.label for gold in golds for b in gold.text_boxes if b.text != ""],
                                                                        +        [b.label for pred in preds for b in pred.text_boxes if b.text != ""],
                                                                        +        output_dict=True,
                                                                        +        zero_division=0,
                                                                        +    )
                                                                        +
                                                                        +
                                                                        +@registry.adapter.register("my-segmentation-adapter")
                                                                        +def segmentation_adapter(
                                                                        +    path: str,
                                                                        +):
                                                                        +    def adapt_to(model):
                                                                        +        for anns_filepath in sorted(Path(path).glob("*.json")):
                                                                        +            pdf_filepath = str(anns_filepath).replace(".json", ".pdf")
                                                                        +            with open(anns_filepath) as f:
                                                                        +                sample = json.load(f)
                                                                        +            pdf = Path(pdf_filepath).read_bytes()
                                                                        +
                                                                        +            if len(sample["annotations"]) == 0:
                                                                        +                continue
                                                                        +
                                                                        +            doc = model.get_pipe("extractor")(pdf)
                                                                        +            doc.id = pdf_filepath.split(".")[0].split("/")[-1]
                                                                        +            doc.content_boxes = [
                                                                        +                line
                                                                        +                for page_num in sorted(set(b.page_num for b in doc.lines))
                                                                        +                for line in align_box_labels(
                                                                        +                    src_boxes=[
                                                                        +                        Box(
                                                                        +                            page_num=b["page"],
                                                                        +                            x0=b["x0"],
                                                                        +                            x1=b["x1"],
                                                                        +                            y0=b["y0"],
                                                                        +                            y1=b["y1"],
                                                                        +                            label=b["label"],
                                                                        +                        )
                                                                        +                        for b in sample["annotations"]
                                                                        +                        if b["page"] == page_num
                                                                        +                    ],
                                                                        +                    dst_boxes=doc.lines,
                                                                        +                    pollution_label=None,
                                                                        +                )
                                                                        +                if line.text == "" or line.label is not None
                                                                        +            ]
                                                                        +            yield doc
                                                                        +
                                                                        +    return adapt_to
                                                                        +
                                                                        +
                                                                        +@app.command(name="train")
                                                                        +def train_my_model(
                                                                        +    train_path: DirectoryPath = "dataset/train",
                                                                        +    val_path: DirectoryPath = "dataset/dev",
                                                                        +    max_steps: int = 1000,
                                                                        +    batch_size: int = 4,
                                                                        +    lr: float = 3e-4,
                                                                        +):
                                                                        +    set_seed(42)
                                                                        +
                                                                        +    # We define the model
                                                                        +    model = Pipeline()
                                                                        +    model.add_pipe("mupdf-extractor", name="extractor")
                                                                        +    model.add_pipe(
                                                                        +        "box-transformer",
                                                                        +        name="embedding",
                                                                        +        config={
                                                                        +            "num_heads": 4,
                                                                        +            "dropout_p": 0.1,
                                                                        +            "activation": "gelu",
                                                                        +            "init_resweight": 0.01,
                                                                        +            "head_size": 16,
                                                                        +            "attention_mode": ["c2c", "c2p", "p2c"],
                                                                        +            "n_layers": 1,
                                                                        +            "n_relative_positions": 64,
                                                                        +            "embedding": {
                                                                        +                "@factory": "embedding-combiner",
                                                                        +                "dropout_p": 0.1,
                                                                        +                "text_encoder": {
                                                                        +                    "@factory": "sub-box-cnn-pooler",
                                                                        +                    "out_channels": 64,
                                                                        +                    "kernel_sizes": (3, 4, 5),
                                                                        +                    "embedding": {
                                                                        +                        "@factory": "simple-text-embedding",
                                                                        +                        "size": 72,
                                                                        +                    },
                                                                        +                },
                                                                        +                "layout_encoder": {
                                                                        +                    "@factory": "box-layout-embedding",
                                                                        +                    "n_positions": 64,
                                                                        +                    "x_mode": "learned",
                                                                        +                    "y_mode": "learned",
                                                                        +                    "w_mode": "learned",
                                                                        +                    "h_mode": "learned",
                                                                        +                    "size": 72,
                                                                        +                },
                                                                        +            },
                                                                        +        },
                                                                        +    )
                                                                        +    model.add_pipe(
                                                                        +        "trainable-classifier",
                                                                        +        name="classifier",
                                                                        +        config={
                                                                        +            "embedding": model.get_pipe("embedding"),
                                                                        +            "labels": [],
                                                                        +        },
                                                                        +    )
                                                                        +
                                                                        +    # Loading and adapting the training and validation data
                                                                        +    train_docs = list(segmentation_adapter(train_path)(model))
                                                                        +    val_docs = list(segmentation_adapter(val_path)(model))
                                                                        +
                                                                        +    # Taking the first `initialization_subset` samples to initialize the model
                                                                        +    model.post_init(train_docs)
                                                                        +
                                                                        +    # Preprocessing the training dataset into a dataloader
                                                                        +    preprocessed = list(model.preprocess_many(train_docs, supervision=True))
                                                                        +    dataloader = DataLoader(
                                                                        +        preprocessed,
                                                                        +        batch_size=batch_size,
                                                                        +        collate_fn=model.collate,
                                                                        +        shuffle=True,
                                                                        +    )
                                                                        +
                                                                        +    optimizer = torch.optim.AdamW(
                                                                        +        params=model.parameters(),
                                                                        +        lr=lr,
                                                                        +    )
                                                                        +
                                                                        +    # We will loop over the dataloader
                                                                        +    iterator = itertools.chain.from_iterable(itertools.repeat(dataloader))
                                                                        +
                                                                        +    for step in tqdm(range(max_steps), "Training model", leave=True):
                                                                        +        batch = next(iterator)
                                                                        +        optimizer.zero_grad()
                                                                        +
                                                                        +        with model.cache():
                                                                        +            loss = torch.zeros((), device="cpu")
                                                                        +            for name, component in model.trainable_pipes():
                                                                        +                output = component.module_forward(batch[component.name])
                                                                        +                if "loss" in output:
                                                                        +                    loss += output["loss"]
                                                                        +
                                                                        +            loss.backward()
                                                                        +
                                                                        +            optimizer.step()
                                                                        +
                                                                        +        if (step % 100) == 0:
                                                                        +            with model.select_pipes(enable=["classifier"]):
                                                                        +                print(score(val_docs, model.pipe(deepcopy(val_docs))))
                                                                        +            model.save("model")
                                                                        +
                                                                        +    return model
                                                                        +
                                                                        +
                                                                        +if __name__ == "__main__":  # only when the file is executed as a script
                                                                        +    app()  # run the CLI application, which exposes the "train" command
                                                                        +
                                                                        +
                                                                        +
                                                                        python train.py --seed 42
                                                                        +
                                                                        +

                                                                        At the end of the training, the pipeline is ready to use (with the .pipe method) since every trained component of the pipeline is self-sufficient, i.e. it contains the preprocessing, inference and postprocessing code required to run it.

                                                                        +

                                                                        Configuration

                                                                        +

                                                                        To decouple the configuration and the code of our training script, let's define a configuration file where we will describe both our training parameters and the pipeline. You can either write the config of the pipeline by hand, or generate it from an instantiated pipeline by running:

                                                                        +
                                                                        print(pipeline.config.to_str())
                                                                        +
                                                                        +
                                                                        +
                                                                        +
                                                                        +
                                                                        config.cfg
                                                                        # This is the equivalent of the API-based declaration at the beginning of the tutorial
                                                                        +[pipeline]
                                                                        +pipeline = ["extractor", "embedding", "classifier"]
                                                                        +disabled = []
                                                                        +components = ${components}
                                                                        +
                                                                        +[components]
                                                                        +
                                                                        +[components.extractor]
                                                                        +@factory = "pdfminer-extractor"
                                                                        +
                                                                        +[components.embedding]
                                                                        +@factory = "box-transformer"
                                                                        +num_heads = 4
                                                                        +dropout_p = 0.1
                                                                        +activation = "gelu"
                                                                        +init_resweight = 0.01
                                                                        +head_size = 16
                                                                        +attention_mode = ["c2c", "c2p", "p2c"]
                                                                        +n_layers = 1
                                                                        +n_relative_positions = 64
                                                                        +
                                                                        +[components.embedding.embedding]
                                                                        +@factory = "embedding-combiner"
                                                                        +dropout_p = 0.1
                                                                        +
                                                                        +[components.embedding.embedding.text_encoder]
                                                                        +@factory = "sub-box-cnn-pooler"
                                                                        +out_channels = 64
                                                                        +kernel_sizes = (3, 4, 5)
                                                                        +
                                                                        +[components.embedding.embedding.text_encoder.embedding]
                                                                        +@factory = "simple-text-embedding"
                                                                        +size = 72
                                                                        +
                                                                        +[components.embedding.embedding.layout_encoder]
                                                                        +@factory = "box-layout-embedding"
                                                                        +n_positions = 64
                                                                        +x_mode = "learned"
                                                                        +y_mode = "learned"
                                                                        +w_mode = "learned"
                                                                        +h_mode = "learned"
                                                                        +size = 72
                                                                        +
                                                                        +[components.classifier]
                                                                        +@factory = "trainable-classifier"
                                                                        +embedding = ${components.embedding}
                                                                        +labels = []
                                                                        +
                                                                        +# This is where we define the training script parameters
                                                                        +# the "train" section refers to the name of the command in the training script
                                                                        +[train]
                                                                        +model = ${pipeline}
                                                                        +train_data = {"@adapter": "my-segmentation-adapter", "path": "data/train"}
                                                                        +val_data = {"@adapter": "my-segmentation-adapter", "path": "data/val"}
                                                                        +max_steps = 1000
                                                                        +seed = 42
                                                                        +lr = 3e-4
                                                                        +batch_size = 4
                                                                        +
                                                                        +
                                                                        +
                                                                        +
                                                                        config.cfg
                                                                        [pipeline]
                                                                        +pipeline = ["extractor", "embedding", "classifier"]
                                                                        +disabled = []
                                                                        +components = ${components}
                                                                        +
                                                                        +[components]
                                                                        +
                                                                        +[components.extractor]
                                                                        +@factory = "mupdf-extractor"
                                                                        +render_pages = true
                                                                        +
                                                                        +[components.embedding]
                                                                        +@factory = "huggingface-embedding"
                                                                        +model = "microsoft/layoutlmv3-base"
                                                                        +use_image = false
                                                                        +window = 128
                                                                        +stride = 64
                                                                        +line_pooling = "mean"
                                                                        +
                                                                        +[components.classifier]
                                                                        +@factory = "trainable-classifier"
                                                                        +embedding = ${components.embedding}
                                                                        +labels = []
                                                                        +
                                                                        +[train]
                                                                        +model = ${pipeline}
                                                                        +max_steps = 1000
                                                                        +lr = 5e-5
                                                                        +seed = 42
                                                                        +train_data = {"@adapter": "my-segmentation-adapter", "path": "data/train"}
                                                                        +val_data = {"@adapter": "my-segmentation-adapter", "path": "data/val"}
                                                                        +batch_size = 8
                                                                        +
                                                                        +
                                                                        +
                                                                        +
                                                                        +

                                                                        and update our training script to use the pipeline and the data adapters defined in the configuration file instead of the Python declaration:

                                                                        +
                                                                        @app.command(name="train")
                                                                        +def train_my_model(
                                                                        ++   model: Pipeline,
                                                                        +-   train_path: DirectoryPath = "data/train",
                                                                        ++   train_data: Callable = segmentation_adapter("data/train"),
                                                                        +-   val_path: DirectoryPath = "data/val",
                                                                        ++   val_data: Callable = segmentation_adapter("data/val"),
                                                                        +    seed: int = 42,
                                                                        +    max_steps: int = 1000,
                                                                        +    batch_size: int = 4,
                                                                        +    lr: float = 3e-4,
                                                                        +):
                                                                        +    # Seed will be set by the CLI util, before `model` is instantiated
                                                                        +-   set_seed(seed)
                                                                        +
                                                                        +    # Model will be defined from the config file using registries
                                                                        +-   model = Pipeline()
                                                                        +-   model.add_pipe("mupdf-extractor", name="extractor")
                                                                        +-   model.add_pipe(
                                                                        +-       "box-transformer",
                                                                        +-       name="embedding",
                                                                        +-       config={
                                                                        +-           "num_heads": 4,
                                                                        +-           "dropout_p": 0.1,
                                                                        +-           "activation": "gelu",
                                                                        +-           "init_resweight": 0.01,
                                                                        +-           "head_size": 16,
                                                                        +-           "attention_mode": ["c2c", "c2p", "p2c"],
                                                                        +-           "n_layers": 1,
                                                                        +-           "n_relative_positions": 64,
                                                                        +-           "embedding": {
                                                                        +-               "@factory": "embedding-combiner",
                                                                        +-               "dropout_p": 0.1,
                                                                        +-               "text_encoder": {
                                                                        +-                   "@factory": "sub-box-cnn-pooler",
                                                                        +-                   "out_channels": 64,
                                                                        +-                   "kernel_sizes": (3, 4, 5),
                                                                        +-                   "embedding": {
                                                                        +-                       "@factory": "simple-text-embedding",
                                                                        +-                       "size": 72,
                                                                        +-                   },
                                                                        +-               },
                                                                        +-               "layout_encoder": {
                                                                        +-                   "@factory": "box-layout-embedding",
                                                                        +-                   "n_positions": 64,
                                                                        +-                   "x_mode": "learned",
                                                                        +-                   "y_mode": "learned",
                                                                        +-                   "w_mode": "learned",
                                                                        +-                   "h_mode": "learned",
                                                                        +-                   "size": 72,
                                                                        +-               },
                                                                        +-           },
                                                                        +-       },
                                                                        +-   )
                                                                        +-   model.add_pipe(
                                                                        +-       "trainable-classifier",
                                                                        +-       name="classifier",
                                                                        +-       config={
                                                                        +-           "embedding": model.get_pipe("embedding"),
                                                                        +-           "labels": [],
                                                                        +-       },
                                                                        +-   )
                                                                        +
                                                                        +    # Loading and adapting the training and validation data
                                                                        +-    train_docs = list(segmentation_adapter(train_path)(model))
                                                                        ++    train_docs = list(train_data(model))
                                                                        +-    val_docs = list(segmentation_adapter(val_path)(model))
                                                                        ++    val_docs = list(val_data(model))
                                                                        +
                                                                        +    # Taking the first `initialization_subset` samples to initialize the model
                                                                        +    ...
                                                                        +
                                                                        +

                                                                        That's it! We can now call the training script with the configuration file as a parameter, and override some of its default values:

                                                                        +
                                                                        python train.py --config config.cfg --components.extractor.extract_styles=true --seed 43
                                                                        +
                                                                        +

                                                                          + + + + + + +
                                                                          +
                                                                          + + +
                                                                          + +
                                                                          + + + +
                                                                          +
                                                                          +
                                                                          +
                                                                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/accelerators/base/index.html b/main/reference/edspdf/accelerators/base/index.html new file mode 100644 index 00000000..6d8a6b80 --- /dev/null +++ b/main/reference/edspdf/accelerators/base/index.html @@ -0,0 +1,2455 @@ + + + + + + + + + + + + + + + + + + + + + + base - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                          + +
                                                                          + + + + + + + + +
                                                                          + + +
                                                                          + +
                                                                          + + + + + + +
                                                                          +
                                                                          + + + +
                                                                          +
                                                                          +
                                                                          + + + + +
                                                                          +
                                                                          +
                                                                          + + + +
                                                                          +
                                                                          +
                                                                          + + + +
                                                                          +
                                                                          +
                                                                          + + + +
                                                                          +
                                                                          + + + + + + + +

                                                                          edspdf.accelerators.base

                                                                          + + +
                                                                          + + + + +
                                                                          + + + +
                                                                          + + + + + + +
                                                                          + + + + +

                                                                          + FromDoc + + +

                                                                          + + +
                                                                          + + +

                                                                          A FromDoc converter (from a PDFDoc to an arbitrary type) can be either:

                                                                          +
                                                                            +
                                                                          • a dict mapping field names to doc attributes
                                                                          • +
                                                                          • a callable that takes a PDFDoc and returns an arbitrary type
                                                                          • +
                                                                          + + + + + +
                                                                          + + + + + + + + + + + +
                                                                          + +
                                                                          + +
                                                                          + + + + +
                                                                          + +
                                                                          + +
                                                                          +

                                                                            + + + + + + +
                                                                            +
                                                                            + + +
                                                                            + +
                                                                            + + + +
                                                                            +
                                                                            +
                                                                            +
                                                                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/accelerators/index.html b/main/reference/edspdf/accelerators/index.html new file mode 100644 index 00000000..0987fee8 --- /dev/null +++ b/main/reference/edspdf/accelerators/index.html @@ -0,0 +1,2358 @@ + + + + + + + + + + + + + + + + + + + + + + accelerators - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                            + +
                                                                            + + + + + + + + +
                                                                            + + +
                                                                            + +
                                                                            + + + + + + +
                                                                            +
                                                                            + + + +
                                                                            +
                                                                            +
                                                                            + + + + +
                                                                            +
                                                                            +
                                                                            + + + +
                                                                            +
                                                                            +
                                                                            + + + +
                                                                            +
                                                                            +
                                                                            + + + +
                                                                            +
                                                                            + + + + + + + +

                                                                            edspdf.accelerators

                                                                            + + +
                                                                            + + + + +
                                                                            + + + +
                                                                            + + + + + + + + + + + +
                                                                            + +
                                                                            + +
                                                                            +

                                                                              + + + + + + +
                                                                              +
                                                                              + + +
                                                                              + +
                                                                              + + + +
                                                                              +
                                                                              +
                                                                              +
                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/accelerators/multiprocessing/index.html b/main/reference/edspdf/accelerators/multiprocessing/index.html new file mode 100644 index 00000000..1682bc7d --- /dev/null +++ b/main/reference/edspdf/accelerators/multiprocessing/index.html @@ -0,0 +1,2696 @@ + + + + + + + + + + + + + + + + + + + + + + multiprocessing - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                              + +
                                                                              + + + + + + + + +
                                                                              + + +
                                                                              + +
                                                                              + + + + + + +
                                                                              +
                                                                              + + + +
                                                                              +
                                                                              +
                                                                              + + + + +
                                                                              +
                                                                              +
                                                                              + + + +
                                                                              +
                                                                              +
                                                                              + + + +
                                                                              +
                                                                              +
                                                                              + + + +
                                                                              +
                                                                              + + + + + + + +

                                                                              edspdf.accelerators.multiprocessing

                                                                              + + +
                                                                              + + + + +
                                                                              + + + +
                                                                              + + + + + + +
                                                                              + + + + +

                                                                              +MultiprocessingAccelerator + +

                                                                              + + +
                                                                              +

                                                                              + Bases: Accelerator

                                                                              + + +

                                                                              If you have multiple CPU cores, and optionally multiple GPUs, we provide a +multiprocessing accelerator that allows you to run the inference on multiple +processes.

                                                                              +

                                                                              This accelerator dispatches the batches between multiple workers +(data-parallelism), and distributes the computation of a given batch on one or two +workers (model-parallelism). This is done by creating two types of workers:

                                                                              +
                                                                                +
                                                                              • a CPUWorker which handles the non deep-learning components and the + preprocessing, collating and postprocessing of deep-learning components
                                                                              • +
                                                                              • a GPUWorker which handles the forward call of the deep-learning components
                                                                              • +
                                                                              +

                                                                              The advantage of dedicating a worker to the deep-learning components is that it +allows multiple batches to be prepared in parallel in multiple CPUWorkers, and ensures +that the GPUWorker never waits for a batch to be ready.

                                                                              +

                                                                              The overall architecture is described in the following figure, for 3 CPU workers and 2 +GPU workers.

                                                                              +
                                                                              + +
                                                                              + +

                                                                              Here is how a small pipeline with rule-based components and deep-learning components +is distributed between the workers:

                                                                              +
                                                                              + +
                                                                              +

                                                                              Examples

                                                                              +
                                                                              docs = list(
                                                                              +    pipeline.pipe(
                                                                              +        [content1, content2, ...],
                                                                              +        accelerator={
                                                                              +            "@accelerator": "multiprocessing",
                                                                              +            "num_cpu_workers": 3,
                                                                              +            "num_gpu_workers": 2,
                                                                              +            "batch_size": 8,
                                                                              +        },
                                                                              +    )
                                                                              +)
                                                                              +
                                                                              + +

                                                                              Parameters

                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                              PARAMETERDESCRIPTION
                                                                              batch_size +

                                                                              Number of documents to process at a time in a CPU/GPU worker

                                                                              +

                                                                              + + TYPE: + int + +

                                                                              +
                                                                              num_cpu_workers +

                                                                              Number of CPU workers. A CPU worker handles the non deep-learning components +and the preprocessing, collating and postprocessing of deep-learning components.

                                                                              +

                                                                              + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                              +
                                                                              num_gpu_workers +

                                                                              Number of GPU workers. A GPU worker handles the forward call of the +deep-learning components.

                                                                              +

                                                                              + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                              +
                                                                              gpu_pipe_names +

                                                                              List of pipe names to accelerate on a GPUWorker, defaults to all pipes +that inherit from TrainablePipe

                                                                              +

                                                                              + + TYPE: + Optional[List[str]] + + + DEFAULT: + None + +

                                                                              +
                                                                              + + + + + +
                                                                              + + + + + + + + + +
                                                                              + + + +

                                                                              +__call__ + +

                                                                              + + +
                                                                              + +

                                                                              Stream of documents to process. Each document can be a string or a tuple

                                                                              + + + + + + + + + + + + + + + + + + +
                                                                              PARAMETERDESCRIPTION
                                                                              inputs + +

                                                                              + + TYPE: + Iterable[Any] + +

                                                                              +
                                                                              model + +

                                                                              + + TYPE: + Any + +

                                                                              +
                                                                              + + + + + + + + + + + + + + + + +
                                                                              YIELDSDESCRIPTION
                                                                              + + Any + + +
                                                                              +

                                                                              Processed outputs of the pipeline

                                                                              +
                                                                              +
                                                                              + +
                                                                              + +
                                                                              + + + +
                                                                              + +
                                                                              + +
                                                                              + + + + +
                                                                              + +
                                                                              + +
                                                                              +

                                                                                + + + + + + +
                                                                                +
                                                                                + + +
                                                                                + +
                                                                                + + + +
                                                                                +
                                                                                +
                                                                                +
                                                                                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/accelerators/simple/index.html b/main/reference/edspdf/accelerators/simple/index.html new file mode 100644 index 00000000..68272742 --- /dev/null +++ b/main/reference/edspdf/accelerators/simple/index.html @@ -0,0 +1,2551 @@ + + + + + + + + + + + + + + + + + + + + + + simple - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                + +
                                                                                + + + + + + + + +
                                                                                + + +
                                                                                + +
                                                                                + + + + + + +
                                                                                +
                                                                                + + + +
                                                                                +
                                                                                +
                                                                                + + + + +
                                                                                +
                                                                                +
                                                                                + + + +
                                                                                +
                                                                                +
                                                                                + + + +
                                                                                +
                                                                                +
                                                                                + + + +
                                                                                +
                                                                                + + + + + + + +

                                                                                edspdf.accelerators.simple

                                                                                + + +
                                                                                + + + + +
                                                                                + + + +
                                                                                + + + + + + +
                                                                                + + + + +

                                                                                +SimpleAccelerator + +

                                                                                + + +
                                                                                +

                                                                                + Bases: Accelerator

                                                                                + + +

                                                                                This is the simplest accelerator, which batches the documents and processes each batch +on the main process (the one calling .pipe()).

                                                                                +

                                                                                Examples

                                                                                +
                                                                                docs = list(pipeline.pipe([content1, content2, ...]))
                                                                                +
                                                                                +

                                                                                or, if you want to override the model-defined batch size

                                                                                +
                                                                                docs = list(pipeline.pipe([content1, content2, ...], batch_size=8))
                                                                                +
                                                                                +

                                                                                which is equivalent to passing a confit dict

                                                                                +
                                                                                docs = list(
                                                                                +    pipeline.pipe(
                                                                                +        [content1, content2, ...],
                                                                                +        accelerator={
                                                                                +            "@accelerator": "simple",
                                                                                +            "batch_size": 8,
                                                                                +        },
                                                                                +    )
                                                                                +)
                                                                                +
                                                                                +

                                                                                or the instantiated accelerator directly

                                                                                +
                                                                                from edspdf.accelerators.simple import SimpleAccelerator
                                                                                +
                                                                                +accelerator = SimpleAccelerator(batch_size=8)
                                                                                +docs = list(pipeline.pipe([content1, content2, ...], accelerator=accelerator))
                                                                                +
                                                                                +

                                                                                If you have a GPU, make sure to move the model to the appropriate device before +calling .pipe(). If you have multiple GPUs, use the +multiprocessing +accelerator instead.

                                                                                +
                                                                                pipeline.to("cuda")
                                                                                +docs = list(pipeline.pipe([content1, content2, ...]))
                                                                                +
                                                                                + +

                                                                                Parameters

                                                                                + + + + + + + + + + + + + +
                                                                                PARAMETERDESCRIPTION
                                                                                batch_size +

                                                                                The number of documents to process in each batch.

                                                                                +

                                                                                + + TYPE: + int + + + DEFAULT: + 32 + +

                                                                                +
                                                                                + + + + + +
                                                                                + + + + + + + + + + + +
                                                                                + +
                                                                                + +
                                                                                + + + + +
                                                                                + +
                                                                                + +
                                                                                +

                                                                                  + + + + + + +
                                                                                  +
                                                                                  + + +
                                                                                  + +
                                                                                  + + + +
                                                                                  +
                                                                                  +
                                                                                  +
                                                                                  + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/index.html b/main/reference/edspdf/index.html new file mode 100644 index 00000000..f8f0426f --- /dev/null +++ b/main/reference/edspdf/index.html @@ -0,0 +1,2356 @@ + + + + + + + + + + + + + + + + + + + + + + edspdf - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                  + +
                                                                                  + + + + + + + + +
                                                                                  + + +
                                                                                  + +
                                                                                  + + + + + + +
                                                                                  +
                                                                                  + + + +
                                                                                  +
                                                                                  +
                                                                                  + + + + +
                                                                                  +
                                                                                  +
                                                                                  + + + +
                                                                                  +
                                                                                  +
                                                                                  + + + +
                                                                                  +
                                                                                  +
                                                                                  + + + +
                                                                                  +
                                                                                  + + + + + + + +

                                                                                  edspdf

                                                                                  + + +
                                                                                  + + + + +
                                                                                  + + + +
                                                                                  + + + + + + + + + + + +
                                                                                  + +
                                                                                  + +
                                                                                  +

                                                                                    + + + + + + +
                                                                                    +
                                                                                    + + +
                                                                                    + +
                                                                                    + + + +
                                                                                    +
                                                                                    +
                                                                                    +
                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/layers/box_transformer/index.html b/main/reference/edspdf/layers/box_transformer/index.html new file mode 100644 index 00000000..8c3f2a54 --- /dev/null +++ b/main/reference/edspdf/layers/box_transformer/index.html @@ -0,0 +1,3096 @@ + + + + + + + + + + + + + + + + + + + + + + box_transformer - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                    + +
                                                                                    + + + + + + + + +
                                                                                    + + +
                                                                                    + +
                                                                                    + + + + + + +
                                                                                    +
                                                                                    + + + +
                                                                                    +
                                                                                    +
                                                                                    + + + + +
                                                                                    +
                                                                                    +
                                                                                    + + + +
                                                                                    +
                                                                                    +
                                                                                    + + + +
                                                                                    +
                                                                                    +
                                                                                    + + + +
                                                                                    +
                                                                                    + + + + + + + +

                                                                                    edspdf.layers.box_transformer

                                                                                    + + +
                                                                                    + + + + +
                                                                                    + + + +
                                                                                    + + + + + + +
                                                                                    + + + + +

                                                                                    +BoxTransformerLayer + +

                                                                                    + + +
                                                                                    +

                                                                                    + Bases: Module

                                                                                    + + +

                                                                                    BoxTransformerLayer combining a self attention layer and a +linear->activation->linear transformation. This layer is used in the +BoxTransformerModule module.

                                                                                    + + + +

                                                                                    Parameters

                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                    PARAMETERDESCRIPTION
                                                                                    input_size +

                                                                                    Input embedding size

                                                                                    +

                                                                                    + + TYPE: + int + +

                                                                                    +
                                                                                    num_heads +

                                                                                    Number of attention heads in the attention layer

                                                                                    +

                                                                                    + + TYPE: + int + + + DEFAULT: + 2 + +

                                                                                    +
                                                                                    dropout_p +

                                                                                    Dropout probability both for the attention layer and embedding projections

                                                                                    +

                                                                                    + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                                                                    +
                                                                                    head_size +

                                                                                    Head sizes of the attention layer

                                                                                    +

                                                                                    + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                    +
                                                                                    activation +

                                                                                    Activation function used in the linear->activation->linear transformation

                                                                                    +

                                                                                    + + TYPE: + ActivationFunction + + + DEFAULT: + 'gelu' + +

                                                                                    +
                                                                                    init_resweight +

                                                                                    Initial weight of the residual gates. +At 0, the layer acts (initially) as an identity function, and at 1 as +a standard Transformer layer. +Initializing with a value close to 0 can help the training converge.

                                                                                    +

                                                                                    + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                                                                    +
                                                                                    attention_mode +

                                                                                    Mode of relative position infused attention layer. +See the +relative attention +documentation for more information.

                                                                                    +

                                                                                    + + TYPE: + Sequence[Literal['c2c', 'c2p', 'p2c']] + + + DEFAULT: + ('c2c', 'c2p', 'p2c') + +

                                                                                    +
                                                                                    position_embedding +

                                                                                    Position embedding to use as key/query position embedding in the attention +computation.

                                                                                    +

                                                                                    + + TYPE: + Optional[Union[FloatTensor, Parameter]] + + + DEFAULT: + None + +

                                                                                    +
                                                                                    + + + + +
                                                                                    + + + + + + + + + +
                                                                                    + + + +

                                                                                    +forward + +

                                                                                    + + +
                                                                                    + +

                                                                                    Forward pass of the BoxTransformerLayer

                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                    PARAMETERDESCRIPTION
                                                                                    embeds +

                                                                                    Embeddings to contextualize +Shape: n_samples * n_keys * input_size

                                                                                    +

                                                                                    + + TYPE: + FloatTensor + +

                                                                                    +
                                                                                    mask +

                                                                                    Mask of the embeddings. 0 means padding element. +Shape: n_samples * n_keys

                                                                                    +

                                                                                    + + TYPE: + BoolTensor + +

                                                                                    +
                                                                                    relative_positions +

                                                                                    Position of the keys relatively to the query elements +Shape: n_samples * n_queries * n_keys * n_coordinates (2 for x/y)

                                                                                    +

                                                                                    + + TYPE: + LongTensor + +

                                                                                    +
                                                                                    no_position_mask +

                                                                                    Key / query pairs for which the position attention terms should +be disabled. +Shape: n_samples * n_queries * n_keys

                                                                                    +

                                                                                    + + TYPE: + Optional[BoolTensor] + + + DEFAULT: + None + +

                                                                                    +
                                                                                    + + + + + + + + + + + + + + + + +
                                                                                    RETURNSDESCRIPTION
                                                                                    + + Tuple[FloatTensor, FloatTensor] + + +
                                                                                    +
                                                                                      +
                                                                                    • Contextualized embeddings + Shape: n_samples * n_keys * input_size
                                                                                    • +
                                                                                    • Attention logits + Shape: n_samples * n_queries * n_keys * n_heads
                                                                                    • +
                                                                                    +
                                                                                    +
                                                                                    + +
                                                                                    + +
                                                                                    + + + +
                                                                                    + +
                                                                                    + +
                                                                                    + + + + +

                                                                                    +BoxTransformerModule + +

                                                                                    + + +
                                                                                    +

                                                                                    + Bases: Module

                                                                                    + + + +

                                                                                    Box Transformer architecture combining a multiple +BoxTransformerLayer +modules. It is mainly used in +BoxTransformer.

                                                                                    + +

                                                                                    Parameters

                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                    PARAMETERDESCRIPTION
                                                                                    input_size +

                                                                                    Input embedding size

                                                                                    +

                                                                                    + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                    +
                                                                                    num_heads +

                                                                                    Number of attention heads in the attention layers

                                                                                    +

                                                                                    + + TYPE: + int + + + DEFAULT: + 2 + +

                                                                                    +
                                                                                    n_relative_positions +

                                                                                    Maximum range of embeddable relative positions between boxes (further +distances are capped to ±n_relative_positions // 2)

                                                                                    +

                                                                                    + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                    +
                                                                                    dropout_p +

                                                                                    Dropout probability both for the attention layers and embedding projections

                                                                                    +

                                                                                    + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                                                                    +
                                                                                    head_size +

                                                                                    Head sizes of the attention layers

                                                                                    +

                                                                                    + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                    +
                                                                                    activation +

                                                                                    Activation function used in the linear->activation->linear transformations

                                                                                    +

                                                                                    + + TYPE: + ActivationFunction + + + DEFAULT: + 'gelu' + +

                                                                                    +
                                                                                    init_resweight +

                                                                                    Initial weight of the residual gates. +At 0, the layer acts (initially) as an identity function, and at 1 as +a standard Transformer layer. +Initializing with a value close to 0 can help the training converge.

                                                                                    +

                                                                                    + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                                                                    +
                                                                                    attention_mode +

                                                                                    Mode of relative position infused attention layer. +See the +relative attention +documentation for more information.

                                                                                    +

                                                                                    + + TYPE: + Sequence[Literal['c2c', 'c2p', 'p2c']] + + + DEFAULT: + ('c2c', 'c2p', 'p2c') + +

                                                                                    +
                                                                                    n_layers +

                                                                                    Number of layers in the Transformer

                                                                                    +

                                                                                    + + TYPE: + int + + + DEFAULT: + 2 + +

                                                                                    +
                                                                                    + + + + +
                                                                                    + + + + + + + + + +
                                                                                    + + + +

                                                                                    +forward + +

                                                                                    + + +
                                                                                    + +

                                                                                    Forward pass of the BoxTransformer

                                                                                    + + + + + + + + + + + + + + + + + + +
                                                                                    PARAMETERDESCRIPTION
                                                                                    embeds +

                                                                                    Embeddings to contextualize +Shape: n_samples * n_keys * input_size

                                                                                    +

                                                                                    + + TYPE: + FoldedTensor + +

                                                                                    +
                                                                                    boxes +

                                                                                    Layout features of the input elements

                                                                                    +

                                                                                    + + TYPE: + Dict + +

                                                                                    +
                                                                                    + + + + + + + + + + + + + + + + +
                                                                                    RETURNSDESCRIPTION
                                                                                    + + Tuple[FloatTensor, List[FloatTensor]] + + +
                                                                                    +
                                                                                      +
                                                                                    • Output of the last BoxTransformerLayer + Shape: n_samples * n_queries * n_keys
                                                                                    • +
                                                                                    • Attention logits of all layers + Shape: n_samples * n_queries * n_keys * n_heads
                                                                                    • +
                                                                                    +
                                                                                    +
                                                                                    + +
                                                                                    + +
                                                                                    + + + +
                                                                                    + +
                                                                                    + +
                                                                                    + + + + +
                                                                                    + +
                                                                                    + +
                                                                                    +

                                                                                      + + + + + + +
                                                                                      +
                                                                                      + + +
                                                                                      + +
                                                                                      + + + +
                                                                                      +
                                                                                      +
                                                                                      +
                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/layers/index.html b/main/reference/edspdf/layers/index.html new file mode 100644 index 00000000..64496d1e --- /dev/null +++ b/main/reference/edspdf/layers/index.html @@ -0,0 +1,2358 @@ + + + + + + + + + + + + + + + + + + + + + + layers - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                      + +
                                                                                      + + + + + + + + +
                                                                                      + + +
                                                                                      + +
                                                                                      + + + + + + +
                                                                                      +
                                                                                      + + + +
                                                                                      +
                                                                                      +
                                                                                      + + + + +
                                                                                      +
                                                                                      +
                                                                                      + + + +
                                                                                      +
                                                                                      +
                                                                                      + + + +
                                                                                      +
                                                                                      +
                                                                                      + + + +
                                                                                      +
                                                                                      + + + + + + + +

                                                                                      edspdf.layers

                                                                                      + + +
                                                                                      + + + + +
                                                                                      + + + +
                                                                                      + + + + + + + + + + + +
                                                                                      + +
                                                                                      + +
                                                                                      +

                                                                                        + + + + + + +
                                                                                        +
                                                                                        + + +
                                                                                        + +
                                                                                        + + + +
                                                                                        +
                                                                                        +
                                                                                        +
                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/layers/relative_attention/index.html b/main/reference/edspdf/layers/relative_attention/index.html new file mode 100644 index 00000000..8394472e --- /dev/null +++ b/main/reference/edspdf/layers/relative_attention/index.html @@ -0,0 +1,2954 @@ + + + + + + + + + + + + + + + + + + + + + + relative_attention - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                        + +
                                                                                        + + + + + + + + +
                                                                                        + + +
                                                                                        + +
                                                                                        + + + + + + +
                                                                                        +
                                                                                        + + + +
                                                                                        +
                                                                                        +
                                                                                        + + + + +
                                                                                        +
                                                                                        +
                                                                                        + + + +
                                                                                        +
                                                                                        +
                                                                                        + + + +
                                                                                        +
                                                                                        +
                                                                                        + + + +
                                                                                        +
                                                                                        + + + + + + + +

                                                                                        edspdf.layers.relative_attention

                                                                                        + + +
                                                                                        + + + + +
                                                                                        + + + +
                                                                                        + + + + + + +
                                                                                        + + + + +

                                                                                        +RelativeAttention + +

                                                                                        + + +
                                                                                        +

                                                                                        + Bases: Module

                                                                                        + + +

                                                                                        A self/cross-attention layer that takes relative position of elements into +account to compute the attention weights. +When running a relative attention layer, keys and queries are represented using +content and position embeddings, where position embeddings are retrieved using +the relative position of keys relative to queries.

                                                                                        + + + +

                                                                                        Parameters

                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                        PARAMETERDESCRIPTION
                                                                                        size +

                                                                                        The size of the output embeddings +Also serves as default if query_size, pos_size, or key_size is None

                                                                                        +

                                                                                        + + TYPE: + int + +

                                                                                        +
                                                                                        n_heads +

                                                                                        The number of attention heads

                                                                                        +

                                                                                        + + TYPE: + int + +

                                                                                        +
                                                                                        query_size +

                                                                                        The size of the query embeddings.

                                                                                        +

                                                                                        + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        key_size +

                                                                                        The size of the key embeddings.

                                                                                        +

                                                                                        + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        value_size +

                                                                                        The size of the value embeddings

                                                                                        +

                                                                                        + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        head_size +

                                                                                        The size of each query / key / value chunk used in the attention dot product +Default: key_size / n_heads

                                                                                        +

                                                                                        + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        position_embedding +

                                                                                        The position embedding used as key and query embeddings

                                                                                        +

                                                                                        + + TYPE: + Optional[Union[FloatTensor, Parameter]] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        dropout_p +

                                                                                        Dropout probability applied on the attention weights +Default: 0.0

                                                                                        +

                                                                                        + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                                                                        +
                                                                                        same_key_query_proj +

                                                                                        Whether to use the same projection operator for content key and queries +when computing the pre-attention key and query embedding chunks +Default: False

                                                                                        +

                                                                                        + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                        +
                                                                                        same_positional_key_query_proj +

                                                                                        Whether to use the same projection operator for positional key and queries +when computing the pre-attention key and query embedding chunks +Default: False

                                                                                        +

                                                                                        + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                        +
                                                                                        n_coordinates +

                                                                                        The number of positional coordinates +For instance, text is 1D so 1 coordinate, images are 2D so 2 coordinates ... +Default: 1

                                                                                        +

                                                                                        + + TYPE: + int + + + DEFAULT: + 1 + +

                                                                                        +
                                                                                        head_bias +

                                                                                        Whether to learn a bias term to add to the attention logits +This is only useful if you plan to use the attention logits for subsequent +operations, since attention weights are unaffected by bias terms.

                                                                                        +

                                                                                        + + TYPE: + bool + + + DEFAULT: + True + +

                                                                                        +
                                                                                        do_pooling +

                                                                                        Whether to compute the output embedding. +If you only plan to use attention logits, you should disable this parameter. +Default: True

                                                                                        +

                                                                                        + + TYPE: + bool + + + DEFAULT: + True + +

                                                                                        +
                                                                                        mode +

                                                                                        Whether to compute content to content (c2c), content to position (c2p) +or position to content (p2c) attention terms. +Setting mode=('c2c',) disables relative position attention terms: this is +the standard attention layer. +To get a better intuition about these different types of attention, here is +a formulation as fictitious search samples from a word in a (1D) text:

                                                                                        +
                                                                                          +
                                                                                        • content-content : "my content is ’ultrasound’ so I’m looking for other + words whose content contains information about temporality"
                                                                                        • +
                                                                                        • content-position: "my content is ’ultrasound’ so I’m looking for other + words that are 3 positions after me"
                                                                                        • +
                                                                                        • position-content : "regardless of my content, I will attend to the word + one position after me if it contains information about temporality, + two words after me if it contains information about location, etc."
                                                                                        • +
                                                                                        +

                                                                                        + + TYPE: + Sequence[Literal['c2c', 'c2p', 'p2c']] + + + DEFAULT: + ('c2c', 'p2c', 'c2p') + +

                                                                                        +
                                                                                        n_additional_heads +

                                                                                        The number of additional head logits to compute. +Those are not used to compute output embeddings, but may be useful in +subsequent operations. +Default: 0

                                                                                        +

                                                                                        + + TYPE: + int + + + DEFAULT: + 0 + +

                                                                                        +
                                                                                        + + + + +
                                                                                        + + + + + + + + + +
                                                                                        + + + +

                                                                                        +forward + +

                                                                                        + + +
                                                                                        + +

                                                                                        Forward pass of the RelativeAttention layer.

                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                        PARAMETERDESCRIPTION
                                                                                        content_queries +

                                                                                        The content query embedding to use in the attention computation +Shape: n_samples * n_queries * query_size

                                                                                        +

                                                                                        + + TYPE: + FloatTensor + +

                                                                                        +
                                                                                        content_keys +

                                                                                        The content key embedding to use in the attention computation. +If None, defaults to the content_queries +Shape: n_samples * n_keys * key_size

                                                                                        +

                                                                                        + + TYPE: + Optional[FloatTensor] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        content_values +

                                                                                        The content values embedding to use in the final pooling computation. +If None, pooling won't be performed. +Shape: n_samples * n_keys * value_size

                                                                                        +

                                                                                        + + TYPE: + Optional[FloatTensor] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        mask +

                                                                                        The mask to apply in the attention computation. +Shape: either +- n_samples * n_keys +- n_samples * n_queries * n_keys +- n_samples * n_queries * n_keys * n_heads

                                                                                        +

                                                                                        + + TYPE: + Optional[BoolTensor] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        relative_positions +

                                                                                        The relative position of keys relative to queries +If None, positional attention terms won't be computed. +Shape: n_samples * n_queries * n_keys * n_coordinates

                                                                                        +

                                                                                        + + TYPE: + Optional[LongTensor] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        no_position_mask +

                                                                                        Key / query pairs for which the position attention terms should +be disabled. +Shape: n_samples * n_queries * n_keys

                                                                                        +

                                                                                        + + TYPE: + Optional[BoolTensor] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        base_attn +

                                                                                        Attention logits to add to the computed attention logits +Shape: n_samples * n_queries * n_keys * n_heads

                                                                                        +

                                                                                        + + TYPE: + Optional[FloatTensor] + + + DEFAULT: + None + +

                                                                                        +
                                                                                        + + + + + + + + + + + + + + + + +
                                                                                        RETURNSDESCRIPTION
                                                                                        + + Union[Tuple[FloatTensor, FloatTensor], FloatTensor] + + +
                                                                                        +
                                                                                          +
                                                                                        • the output contextualized embeddings (only if content_values is not None + and the do_pooling attribute is set to True) + Shape: n_samples * n_keys * size
                                                                                        • +
                                                                                        • the attention logits + Shape: n_samples * n_keys * n_queries * (n_heads + n_additional_heads)
                                                                                        • +
                                                                                        +
                                                                                        +
                                                                                        + +
                                                                                        + +
                                                                                        + + + +
                                                                                        + +
                                                                                        + +
                                                                                        + + + + +
                                                                                        + +
                                                                                        + +
                                                                                        +

                                                                                          + + + + + + +
                                                                                          +
                                                                                          + + +
                                                                                          + +
                                                                                          + + + +
                                                                                          +
                                                                                          +
                                                                                          +
                                                                                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/layers/sinusoidal_embedding/index.html b/main/reference/edspdf/layers/sinusoidal_embedding/index.html new file mode 100644 index 00000000..a8bdf74c --- /dev/null +++ b/main/reference/edspdf/layers/sinusoidal_embedding/index.html @@ -0,0 +1,2620 @@ + + + + + + + + + + + + + + + + + + + + + + sinusoidal_embedding - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                          + +
                                                                                          + + + + + + + + +
                                                                                          + + +
                                                                                          + +
                                                                                          + + + + + + +
                                                                                          +
                                                                                          + + + +
                                                                                          +
                                                                                          +
                                                                                          + + + + +
                                                                                          +
                                                                                          +
                                                                                          + + + +
                                                                                          +
                                                                                          +
                                                                                          + + + +
                                                                                          +
                                                                                          +
                                                                                          + + + +
                                                                                          +
                                                                                          + + + + + + + +

                                                                                          edspdf.layers.sinusoidal_embedding

                                                                                          + + +
                                                                                          + + + + +
                                                                                          + + + +
                                                                                          + + + + + + +
                                                                                          + + + + +

                                                                                          +SinusoidalEmbedding + +

                                                                                          + + +
                                                                                          +

                                                                                          + Bases: Module

                                                                                          + + +

                                                                                          A position embedding lookup table that stores embeddings for a fixed number +of positions. +The value of each of the embedding_dim channels of the generated embedding +is generated according to a trigonometric function (sin for even channels, +cos for odd channels). +The frequency of the signal in each pair of channels varies according to the +temperature parameter.

                                                                                          +

                                                                                          Any input position above the maximum value num_embeddings will be capped to +num_embeddings - 1

                                                                                          + + + +

                                                                                          Parameters

                                                                                          + + + + + + + + + + + + + + + + + + + + + +
                                                                                          PARAMETERDESCRIPTION
                                                                                          num_embeddings +

                                                                                          The maximum number of position embeddings stored in this table

                                                                                          +

                                                                                          + + TYPE: + int + +

                                                                                          +
                                                                                          embedding_dim +

                                                                                          The embedding size

                                                                                          +

                                                                                          + + TYPE: + int + +

                                                                                          +
                                                                                          temperature +

                                                                                          The temperature controls the range of frequencies used by each +channel of the embedding

                                                                                          +

                                                                                          + + TYPE: + float + + + DEFAULT: + 10000.0 + +

                                                                                          +
                                                                                          + + + + +
                                                                                          + + + + + + + + + +
                                                                                          + + + +

                                                                                          +forward + +

                                                                                          + + +
                                                                                          + +

                                                                                          Forward pass of the SinusoidalEmbedding module

                                                                                          + + + + + + + + + + + + + + +
                                                                                          PARAMETERDESCRIPTION
                                                                                          indices +

                                                                                          Shape: any

                                                                                          +

                                                                                          + + TYPE: + LongTensor + +

                                                                                          +
                                                                                          + + + + + + + + + + + + + + + + +
                                                                                          RETURNSDESCRIPTION
                                                                                          + + FloatTensor + + +
                                                                                          +

                                                                                          Shape: (*input_shape, embedding_dim)

                                                                                          +
                                                                                          +
                                                                                          + +
                                                                                          + +
                                                                                          + + + +
                                                                                          + +
                                                                                          + +
                                                                                          + + + + +
                                                                                          + +
                                                                                          + +
                                                                                          +

                                                                                            + + + + + + +
                                                                                            +
                                                                                            + + +
                                                                                            + +
                                                                                            + + + +
                                                                                            +
                                                                                            +
                                                                                            +
                                                                                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/layers/vocabulary/index.html b/main/reference/edspdf/layers/vocabulary/index.html new file mode 100644 index 00000000..4ad975a6 --- /dev/null +++ b/main/reference/edspdf/layers/vocabulary/index.html @@ -0,0 +1,2716 @@ + + + + + + + + + + + + + + + + + + + + + + vocabulary - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                            + +
                                                                                            + + + + + + + + +
                                                                                            + + +
                                                                                            + +
                                                                                            + + + + + + +
                                                                                            +
                                                                                            + + + +
                                                                                            +
                                                                                            +
                                                                                            + + + + +
                                                                                            +
                                                                                            +
                                                                                            + + + +
                                                                                            +
                                                                                            +
                                                                                            + + + +
                                                                                            +
                                                                                            +
                                                                                            + + + +
                                                                                            +
                                                                                            + + + + + + + +

                                                                                            edspdf.layers.vocabulary

                                                                                            + + +
                                                                                            + + + + +
                                                                                            + + + +
                                                                                            + + + + + + +
                                                                                            + + + + +

                                                                                            +Vocabulary + +

                                                                                            + + +
                                                                                            +

                                                                                            + Bases: Module, Generic[T]

                                                                                            + + +

                                                                                            Vocabulary layer. +This is not meant to be used as a torch.nn.Module but subclassing +torch.nn.Module makes the instances appear when printing a model, which is nice.

                                                                                            + + + +

                                                                                            Parameters

                                                                                            + + + + + + + + + + + + + + + + + +
                                                                                            PARAMETERDESCRIPTION
                                                                                            items +

                                                                                            Initial vocabulary elements if any. +Specific elements such as padding and unk can be set here to enforce their +index in the vocabulary.

                                                                                            +

                                                                                            + + TYPE: + Sequence[T] + + + DEFAULT: + None + +

                                                                                            +
                                                                                            default +

                                                                                            Default index to use for out of vocabulary elements +Defaults to -100

                                                                                            +

                                                                                            + + TYPE: + int + + + DEFAULT: + -100 + +

                                                                                            +
                                                                                            + + + + +
                                                                                            + + + + + + + + + +
                                                                                            + + + +

                                                                                            +initialization + +

                                                                                            + + +
                                                                                            + +

                                                                                            Enters the initialization mode. +Out of vocabulary elements will be assigned an index.

                                                                                            + +
                                                                                            + +
                                                                                            + +
                                                                                            + + + +

                                                                                            +encode + +

                                                                                            + + +
                                                                                            + +

                                                                                            Converts an element into its vocabulary index +If the layer is in its initialization mode (with vocab.initialization(): ...), +and the element is out of vocabulary, a new index will be created and returned. +Otherwise, any oov element will be encoded with the default index.

                                                                                            + + + + + + + + + + + + + + +
                                                                                            PARAMETERDESCRIPTION
                                                                                            item + +

                                                                                            +

                                                                                            +
                                                                                            + + + + + + + + + + + + + + + + +
                                                                                            RETURNSDESCRIPTION
                                                                                            + + int + + +
                                                                                            + +
                                                                                            +
                                                                                            + +
                                                                                            + +
                                                                                            + +
                                                                                            + + + +

                                                                                            +decode + +

                                                                                            + + +
                                                                                            + +

                                                                                            Converts an index into its original value

                                                                                            + + + + + + + + + + + + + + +
                                                                                            PARAMETERDESCRIPTION
                                                                                            idx + +

                                                                                            +

                                                                                            +
                                                                                            + + + + + + + + + + + + + + + + +
                                                                                            RETURNSDESCRIPTION
                                                                                            + + InputT + + +
                                                                                            + +
                                                                                            +
                                                                                            + +
                                                                                            + +
                                                                                            + + + +
                                                                                            + +
                                                                                            + +
                                                                                            + + + + +
                                                                                            + +
                                                                                            + +
                                                                                            +

                                                                                              + + + + + + +
                                                                                              +
                                                                                              + + +
                                                                                              + +
                                                                                              + + + +
                                                                                              +
                                                                                              +
                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipeline/index.html b/main/reference/edspdf/pipeline/index.html new file mode 100644 index 00000000..87d5aa13 --- /dev/null +++ b/main/reference/edspdf/pipeline/index.html @@ -0,0 +1,4360 @@ + + + + + + + + + + + + + + + + + + + + + + pipeline - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                              + +
                                                                                              + + + + + + + + +
                                                                                              + + +
                                                                                              + +
                                                                                              + + + + + + +
                                                                                              +
                                                                                              + + + +
                                                                                              +
                                                                                              +
                                                                                              + + + + +
                                                                                              +
                                                                                              +
                                                                                              + + + +
                                                                                              +
                                                                                              +
                                                                                              + + + +
                                                                                              +
                                                                                              +
                                                                                              + + + +
                                                                                              +
                                                                                              + + + + + + + +

                                                                                              edspdf.pipeline

                                                                                              + + +
                                                                                              + + + + +
                                                                                              + + + +
                                                                                              + + + + + + +
                                                                                              + + + + +

                                                                                              +Pipeline + +

                                                                                              + + +
                                                                                              + + +

                                                                                              Pipeline object used to build hybrid and multitask PDF processing pipelines. +It uses PyTorch as the deep-learning backend and allows components to share +subcomponents.

                                                                                              +

                                                                                              See the documentation for more details.

                                                                                              + + + +

                                                                                              Parameters

                                                                                              + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              batch_size +

                                                                                              Batch size to use in the .pipe() method

                                                                                              +

                                                                                              + + TYPE: + Optional[int] + + + DEFAULT: + 4 + +

                                                                                              +
                                                                                              meta +

                                                                                              Meta information about the pipeline

                                                                                              +

                                                                                              + + TYPE: + Dict[str, Any] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              + + + + +
                                                                                              + + + + + + + +
                                                                                              + + + + +

                                                                                              + disabled + + + property + + +

                                                                                              + + +
                                                                                              + +

                                                                                              The names of the disabled components

                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + + + + +

                                                                                              + cfg: Config + + + property + + +

                                                                                              + + +
                                                                                              + +

                                                                                              Returns the config of the pipeline, including the config of all components. +Adapted from spaCy to allow references between components.

                                                                                              +
                                                                                              + +
                                                                                              + + + +
                                                                                              + + + +

                                                                                              +get_pipe + +

                                                                                              + + +
                                                                                              + +

                                                                                              Get a component by its name.

                                                                                              + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              name +

                                                                                              The name of the component to get.

                                                                                              +

                                                                                              + + TYPE: + str + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + Pipe + + +
                                                                                              + +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +has_pipe + +

                                                                                              + + +
                                                                                              + +

                                                                                              Check if a component exists in the pipeline.

                                                                                              + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              name +

                                                                                              The name of the component to check.

                                                                                              +

                                                                                              + + TYPE: + str + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + bool + + +
                                                                                              + +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +create_pipe + +

                                                                                              + + +
                                                                                              + +

                                                                                              Create a component from a factory name.

                                                                                              + + + + + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              factory +

                                                                                              The name of the factory to use

                                                                                              +

                                                                                              + + TYPE: + str + +

                                                                                              +
                                                                                              name +

                                                                                              The name of the component

                                                                                              +

                                                                                              + + TYPE: + str + +

                                                                                              +
                                                                                              config +

                                                                                              The config to pass to the factory

                                                                                              +

                                                                                              + + TYPE: + Dict[str, Any] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + Pipe + + +
                                                                                              + +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +add_pipe + +

                                                                                              + + +
                                                                                              + +

                                                                                              Add a component to the pipeline.

                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              factory +

                                                                                              The name of the component to add or the component itself

                                                                                              +

                                                                                              + + TYPE: + Union[str, Pipe] + +

                                                                                              +
                                                                                              name +

                                                                                              The name of the component. If not provided, the name of the component +will be used if it has one (.name), otherwise the factory name will be used.

                                                                                              +

                                                                                              + + TYPE: + Optional[str] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              first +

                                                                                              Whether to add the component to the beginning of the pipeline. This argument +is mutually exclusive with before and after.

                                                                                              +

                                                                                              + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                              +
                                                                                              before +

                                                                                              The name of the component to add the new component before. This argument is +mutually exclusive with after and first.

                                                                                              +

                                                                                              + + TYPE: + Optional[str] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              after +

                                                                                              The name of the component to add the new component after. This argument is +mutually exclusive with before and first.

                                                                                              +

                                                                                              + + TYPE: + Optional[str] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              config +

                                                                                              The arguments to pass to the component factory.

                                                                                              +

                                                                                              Note that instead of replacing arguments with the same keys, the config +will be merged with the default config of the component. This means that +you can override specific nested arguments without having to specify the +entire config.

                                                                                              +

                                                                                              + + TYPE: + Optional[Dict[str, Any]] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + Pipe + + +
                                                                                              +

                                                                                              The component that was added to the pipeline.

                                                                                              +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +__call__ + +

                                                                                              + + +
                                                                                              + +

                                                                                              Apply each component successively on a document.

                                                                                              + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              doc +

                                                                                              The doc to create the PDFDoc from, or a PDFDoc.

                                                                                              +

                                                                                              + + TYPE: + Any + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + PDFDoc + + +
                                                                                              + +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +pipe + +

                                                                                              + + +
                                                                                              + +

                                                                                              Process a stream of documents by applying each component successively on +batches of documents.

                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              inputs +

                                                                                              The inputs to create the PDFDocs from, or the PDFDocs directly.

                                                                                              +

                                                                                              + + TYPE: + Any + +

                                                                                              +
                                                                                              batch_size +

                                                                                              The batch size to use. If not provided, the batch size of the pipeline +object will be used.

                                                                                              +

                                                                                              + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              accelerator +

                                                                                              The accelerator to use for processing the documents. If not provided, +the default accelerator will be used.

                                                                                              +

                                                                                              + + TYPE: + Optional[Union[str, Accelerator]] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              to_doc +

                                                                                              The function to use to convert the inputs to PDFDoc objects. By default, +the content field of the inputs will be used if dict-like objects are +provided, otherwise the inputs will be passed directly to the pipeline.

                                                                                              +

                                                                                              + + TYPE: + Optional[ToDoc] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              from_doc +

                                                                                              The function to use to convert the PDFDoc objects to outputs. By default, +the PDFDoc objects will be returned directly.

                                                                                              +

                                                                                              + + TYPE: + FromDoc + + + DEFAULT: + lambda : doc + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + Iterable[PDFDoc] + + +
                                                                                              + +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +cache + +

                                                                                              + + +
                                                                                              + +

                                                                                              Enable caching for all (trainable) components in the pipeline

                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +trainable_pipes + +

                                                                                              + + +
                                                                                              + +

                                                                                              Yields components that are PyTorch modules.

                                                                                              + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              disable +

                                                                                              The names of disabled components, which will be skipped.

                                                                                              +

                                                                                              + + TYPE: + Sequence[str] + + + DEFAULT: + () + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + Iterable[Tuple[str, TrainablePipe]] + + +
                                                                                              + +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +post_init + +

                                                                                              + + +
                                                                                              + +

                                                                                              Completes the initialization of the pipeline by calling the post_init +method of all components that have one. +This is useful for components that need to see some data to build +their vocabulary, for instance.

                                                                                              + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              gold_data +

                                                                                              The documents to use for initialization. +Each component will not necessarily see all the data.

                                                                                              +

                                                                                              + + TYPE: + Iterable[PDFDoc] + +

                                                                                              +
                                                                                              exclude +

                                                                                              The names of components to exclude from initialization. +This argument will be gradually updated with the names of initialized +components.

                                                                                              +

                                                                                              + + TYPE: + Optional[set] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +from_config + + + classmethod + + +

                                                                                              + + +
                                                                                              + +

                                                                                              Create a pipeline from a config object

                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              config +

                                                                                              The config to use

                                                                                              +

                                                                                              + + TYPE: + Dict[str, Any] + + + DEFAULT: + {} + +

                                                                                              +
                                                                                              disable +

                                                                                              Components to disable

                                                                                              +

                                                                                              + + TYPE: + Optional[Set[str]] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              enable +

                                                                                              Components to enable

                                                                                              +

                                                                                              + + TYPE: + Optional[Set[str]] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              exclude +

                                                                                              Components to exclude

                                                                                              +

                                                                                              + + TYPE: + Optional[Set[str]] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              meta +

                                                                                              Metadata to add to the pipeline

                                                                                              +

                                                                                              + + TYPE: + Optional[Dict[str, Any]] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + Pipeline + + +
                                                                                              + +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +__get_validators__ + + + classmethod + + +

                                                                                              + + +
                                                                                              + +

                                                                                              Pydantic validators generator

                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +validate + + + classmethod + + +

                                                                                              + + +
                                                                                              + +

                                                                                              Pydantic validator, used in the validate_arguments decorated functions

                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +preprocess + +

                                                                                              + + +
                                                                                              + +

                                                                                              Runs the preprocessing methods of each component in the pipeline +on a document and returns a dictionary containing the results, with the +component names as keys.

                                                                                              + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              doc +

                                                                                              The document to preprocess

                                                                                              +

                                                                                              + + TYPE: + PDFDoc + +

                                                                                              +
                                                                                              supervision +

                                                                                              Whether to include supervision information in the preprocessing

                                                                                              +

                                                                                              + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + Dict[str, Any] + + +
                                                                                              + +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +preprocess_many + +

                                                                                              + + +
                                                                                              + +

                                                                                              Runs the preprocessing methods of each component in the pipeline on +a collection of documents and returns an iterable of dictionaries containing +the results, with the component names as keys.

                                                                                              + + + + + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              docs + +

                                                                                              + + TYPE: + Iterable[PDFDoc] + +

                                                                                              +
                                                                                              compress +

                                                                                              Whether to deduplicate identical preprocessing outputs of the results +if multiple documents share identical subcomponents. This step is required +to enable the cache mechanism when training or running the pipeline over +tabular datasets such as pyarrow tables that do not store referential +equality information.

                                                                                              +

                                                                                              + + DEFAULT: + True + +

                                                                                              +
                                                                                              supervision +

                                                                                              Whether to include supervision information in the preprocessing

                                                                                              +

                                                                                              + + DEFAULT: + True + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + Iterable[OutputT] + + +
                                                                                              + +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +collate + +

                                                                                              + + +
                                                                                              + +

                                                                                              Collates a batch of preprocessed samples into a single (maybe nested) +dictionary of tensors by calling the collate method of each component.

                                                                                              + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              batch +

                                                                                              The batch of preprocessed samples

                                                                                              +

                                                                                              + + TYPE: + List[Dict[str, Any]] + +

                                                                                              +
                                                                                              device +

                                                                                              The device to move the tensors to before returning them

                                                                                              +

                                                                                              + + TYPE: + Optional[device] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              + + + + + + + + + + + + + + + + +
                                                                                              RETURNSDESCRIPTION
                                                                                              + + Dict[str, Any] + + +
                                                                                              +

                                                                                              The collated batch

                                                                                              +
                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +parameters + +

                                                                                              + + +
                                                                                              + +

                                                                                              Returns an iterator over the Pytorch parameters of the components in the +pipeline

                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +named_parameters + +

                                                                                              + + +
                                                                                              + +

                                                                                              Returns an iterator over the names and Pytorch parameters of the components in the +pipeline

                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +to + +

                                                                                              + + +
                                                                                              + +

                                                                                              Moves the pipeline to a given device

                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +train + +

                                                                                              + + +
                                                                                              + +

                                                                                              Enables training mode on pytorch modules

                                                                                              + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              mode +

                                                                                              Whether to enable training or not

                                                                                              +

                                                                                              + + DEFAULT: + True + +

                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +save + +

                                                                                              + + +
                                                                                              + +

                                                                                              Save the pipeline to a directory.

                                                                                              + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              path +

                                                                                              The path to the directory to save the pipeline to. Every component will be +saved to separate subdirectories of this directory, except for tensors +that will be saved to shared files depending on the references between +the components.

                                                                                              +

                                                                                              + + TYPE: + Union[str, Path] + +

                                                                                              +
                                                                                              exclude +

                                                                                              The names of the components, or attributes to exclude from the saving +process. This set will be gradually filled in place as components are +saved

                                                                                              +

                                                                                              + + TYPE: + Optional[Set[str]] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +load_state_from_disk + +

                                                                                              + + +
                                                                                              + +

                                                                                              Load the pipeline from a directory. Components will be updated in-place.

                                                                                              + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              path +

                                                                                              The path to the directory to load the pipeline from

                                                                                              +

                                                                                              + + TYPE: + Union[str, Path] + +

                                                                                              +
                                                                                              exclude +

                                                                                              The names of the components, or attributes to exclude from the loading +process. This set will be gradually filled in place as components are +loaded

                                                                                              +

                                                                                              + + TYPE: + Set[str] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +

                                                                                              +select_pipes + +

                                                                                              + + +
                                                                                              + +

                                                                                              Temporarily disable and enable components in the pipeline.

                                                                                              + + + + + + + + + + + + + + + + + + +
                                                                                              PARAMETERDESCRIPTION
                                                                                              disable +

                                                                                              The name of the component to disable, or a list of names.

                                                                                              +

                                                                                              + + TYPE: + Optional[Union[str, Iterable[str]]] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              enable +

                                                                                              The name of the component to enable, or a list of names.

                                                                                              +

                                                                                              + + TYPE: + Optional[Union[str, Iterable[str]]] + + + DEFAULT: + None + +

                                                                                              +
                                                                                              + +
                                                                                              + +
                                                                                              + + + +
                                                                                              + +
                                                                                              + +
                                                                                              + + + + +
                                                                                              + +
                                                                                              + +
                                                                                              +

                                                                                                + + + + + + +
                                                                                                +
                                                                                                + + +
                                                                                                + +
                                                                                                + + + +
                                                                                                +
                                                                                                +
                                                                                                +
                                                                                                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/aggregators/index.html b/main/reference/edspdf/pipes/aggregators/index.html new file mode 100644 index 00000000..e3f63c85 --- /dev/null +++ b/main/reference/edspdf/pipes/aggregators/index.html @@ -0,0 +1,2360 @@ + + + + + + + + + + + + + + + + + + + + + + aggregators - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                + +
                                                                                                + + + + + + + + +
                                                                                                + + +
                                                                                                + +
                                                                                                + + + + + + +
                                                                                                +
                                                                                                + + + +
                                                                                                +
                                                                                                +
                                                                                                + + + + +
                                                                                                +
                                                                                                +
                                                                                                + + + +
                                                                                                +
                                                                                                +
                                                                                                + + + +
                                                                                                +
                                                                                                +
                                                                                                + + + +
                                                                                                +
                                                                                                + + + + + + + +

                                                                                                edspdf.pipes.aggregators

                                                                                                + + +
                                                                                                + + + + +
                                                                                                + + + +
                                                                                                + + + + + + + + + + + +
                                                                                                + +
                                                                                                + +
                                                                                                +

                                                                                                  + + + + + + +
                                                                                                  +
                                                                                                  + + +
                                                                                                  + +
                                                                                                  + + + +
                                                                                                  +
                                                                                                  +
                                                                                                  +
                                                                                                  + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/aggregators/simple/index.html b/main/reference/edspdf/pipes/aggregators/simple/index.html new file mode 100644 index 00000000..a79fa964 --- /dev/null +++ b/main/reference/edspdf/pipes/aggregators/simple/index.html @@ -0,0 +1,2648 @@ + + + + + + + + + + + + + + + + + + + + + + simple - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                  + +
                                                                                                  + + + + + + + + +
                                                                                                  + + +
                                                                                                  + +
                                                                                                  + + + + + + +
                                                                                                  +
                                                                                                  + + + +
                                                                                                  +
                                                                                                  +
                                                                                                  + + + + +
                                                                                                  +
                                                                                                  +
                                                                                                  + + + +
                                                                                                  +
                                                                                                  +
                                                                                                  + + + +
                                                                                                  +
                                                                                                  +
                                                                                                  + + + +
                                                                                                  +
                                                                                                  + + + + + + + +

                                                                                                  edspdf.pipes.aggregators.simple

                                                                                                  + + +
                                                                                                  + + + + +
                                                                                                  + + + +
                                                                                                  + + + + + + +
                                                                                                  + + + + +

                                                                                                  +SimpleAggregator + +

                                                                                                  + + +
                                                                                                  + + +

                                                                                                  Aggregator that returns texts and styles. It groups all text boxes with the same +label under the aggregated_texts attribute of the document, and additionally aggregates the +styles of the text boxes.

                                                                                                  +

                                                                                                  Examples

                                                                                                  +

                                                                                                  Create a pipeline

                                                                                                  +
                                                                                                  +
                                                                                                  +
                                                                                                  +
                                                                                                  pipeline = ...
                                                                                                  +pipeline.add_pipe(
                                                                                                  +    "simple-aggregator",
                                                                                                  +    name="aggregator",
                                                                                                  +    config={
                                                                                                  +        "new_line_threshold": 0.2,
                                                                                                  +        "new_paragraph_threshold": 1.5,
                                                                                                  +        "label_map": {
                                                                                                  +            "body": "text",
                                                                                                  +            "table": "text",
                                                                                                  +        },
                                                                                                  +    },
                                                                                                  +)
                                                                                                  +
                                                                                                  +
                                                                                                  +
                                                                                                  +
                                                                                                  ...
                                                                                                  +
                                                                                                  +[components.aggregator]
                                                                                                  +@factory = "simple-aggregator"
                                                                                                  +new_line_threshold = 0.2
                                                                                                  +new_paragraph_threshold = 1.5
                                                                                                  +label_map = { body = "text", table = "text" }
                                                                                                  +
                                                                                                  +...
                                                                                                  +
                                                                                                  +
                                                                                                  +
                                                                                                  +
                                                                                                  +

                                                                                                  and run it on a document:

                                                                                                  +
                                                                                                  doc = pipeline(doc)
                                                                                                  +print(doc.aggregated_texts)
                                                                                                  +# {
                                                                                                  +#     "text": "This is the body of the document, followed by a table | A | B |"
                                                                                                  +# }
                                                                                                  +
                                                                                                  + +

                                                                                                  Parameters

                                                                                                  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                  PARAMETERDESCRIPTION
                                                                                                  pipeline +

                                                                                                  The pipeline object

                                                                                                  +

                                                                                                  + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                                                                  +
                                                                                                  name +

                                                                                                  The name of the component

                                                                                                  +

                                                                                                  + + TYPE: + str + + + DEFAULT: + 'simple-aggregator' + +

                                                                                                  +
                                                                                                  sort +

                                                                                                  Whether to sort text boxes inside each label group by (page, y, x) position +before merging them.

                                                                                                  +

                                                                                                  + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                                  +
                                                                                                  new_line_threshold +

                                                                                                  Minimum ratio of the distance between two lines to the median height of +lines to consider them as being on separate lines

                                                                                                  +

                                                                                                  + + TYPE: + float + + + DEFAULT: + 0.2 + +

                                                                                                  +
                                                                                                  new_paragraph_threshold +

                                                                                                  Minimum ratio of the distance between two lines to the median height of +lines to consider them as being in separate paragraphs and thus add a +newline character between them.

                                                                                                  +

                                                                                                  + + TYPE: + float + + + DEFAULT: + 1.5 + +

                                                                                                  +
                                                                                                  label_map +

                                                                                                  A dictionary mapping labels to new labels. This is useful to group labels +together, for instance, to output both "body" and "table" as "text".

                                                                                                  +

                                                                                                  + + TYPE: + Dict + + + DEFAULT: + {} + +

                                                                                                  +
                                                                                                  + + + + + +
                                                                                                  + + + + + + + + + + + +
                                                                                                  + +
                                                                                                  + +
                                                                                                  + + + + +
                                                                                                  + +
                                                                                                  + +
                                                                                                  +

                                                                                                    + + + + + + +
                                                                                                    +
                                                                                                    + + +
                                                                                                    + +
                                                                                                    + + + +
                                                                                                    +
                                                                                                    +
                                                                                                    +
                                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/classifiers/dummy/index.html b/main/reference/edspdf/pipes/classifiers/dummy/index.html new file mode 100644 index 00000000..fd13def3 --- /dev/null +++ b/main/reference/edspdf/pipes/classifiers/dummy/index.html @@ -0,0 +1,2534 @@ + + + + + + + + + + + + + + + + + + + + + + dummy - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                    + +
                                                                                                    + + + + + + + + +
                                                                                                    + + +
                                                                                                    + +
                                                                                                    + + + + + + +
                                                                                                    +
                                                                                                    + + + +
                                                                                                    +
                                                                                                    +
                                                                                                    + + + + +
                                                                                                    +
                                                                                                    +
                                                                                                    + + + +
                                                                                                    +
                                                                                                    +
                                                                                                    + + + +
                                                                                                    +
                                                                                                    +
                                                                                                    + + + +
                                                                                                    +
                                                                                                    + + + + + + + +

                                                                                                    edspdf.pipes.classifiers.dummy

                                                                                                    + + +
                                                                                                    + + + + +
                                                                                                    + + + +
                                                                                                    + + + + + + +
                                                                                                    + + + + +

                                                                                                    +DummyClassifier + +

                                                                                                    + + +
                                                                                                    + + +

                                                                                                    Dummy classifier, for chaos purposes. Assigns the configured label to every line.

                                                                                                    + +

                                                                                                    Parameters

                                                                                                    + + + + + + + + + + + + + + + + + + + + + +
                                                                                                    PARAMETERDESCRIPTION
                                                                                                    pipeline +

                                                                                                    The pipeline object.

                                                                                                    +

                                                                                                    + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                                                                    +
                                                                                                    name +

                                                                                                    The name of the component.

                                                                                                    +

                                                                                                    + + TYPE: + str + + + DEFAULT: + 'dummy-classifier' + +

                                                                                                    +
                                                                                                    label +

                                                                                                    The label to assign to each line.

                                                                                                    +

                                                                                                    + + TYPE: + str + +

                                                                                                    +
                                                                                                    + + + + + +
                                                                                                    + + + + + + + + + + + +
                                                                                                    + +
                                                                                                    + +
                                                                                                    + + + + +
                                                                                                    + +
                                                                                                    + +
                                                                                                    +

                                                                                                      + + + + + + +
                                                                                                      +
                                                                                                      + + +
                                                                                                      + +
                                                                                                      + + + +
                                                                                                      +
                                                                                                      +
                                                                                                      +
                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/classifiers/index.html b/main/reference/edspdf/pipes/classifiers/index.html new file mode 100644 index 00000000..015141ff --- /dev/null +++ b/main/reference/edspdf/pipes/classifiers/index.html @@ -0,0 +1,2360 @@ + + + + + + + + + + + + + + + + + + + + + + classifiers - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                      + +
                                                                                                      + + + + + + + + +
                                                                                                      + + +
                                                                                                      + +
                                                                                                      + + + + + + +
                                                                                                      +
                                                                                                      + + + +
                                                                                                      +
                                                                                                      +
                                                                                                      + + + + +
                                                                                                      +
                                                                                                      +
                                                                                                      + + + +
                                                                                                      +
                                                                                                      +
                                                                                                      + + + +
                                                                                                      +
                                                                                                      +
                                                                                                      + + + +
                                                                                                      +
                                                                                                      + + + + + + + +

                                                                                                      edspdf.pipes.classifiers

                                                                                                      + + +
                                                                                                      + + + + +
                                                                                                      + + + +
                                                                                                      + + + + + + + + + + + +
                                                                                                      + +
                                                                                                      + +
                                                                                                      +

                                                                                                        + + + + + + +
                                                                                                        +
                                                                                                        + + +
                                                                                                        + +
                                                                                                        + + + +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/classifiers/mask/index.html b/main/reference/edspdf/pipes/classifiers/mask/index.html new file mode 100644 index 00000000..d10b995c --- /dev/null +++ b/main/reference/edspdf/pipes/classifiers/mask/index.html @@ -0,0 +1,2850 @@ + + + + + + + + + + + + + + + + + + + + + + mask - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                        + +
                                                                                                        + + + + + + + + +
                                                                                                        + + +
                                                                                                        + +
                                                                                                        + + + + + + +
                                                                                                        +
                                                                                                        + + + +
                                                                                                        +
                                                                                                        +
                                                                                                        + + + + +
                                                                                                        +
                                                                                                        +
                                                                                                        + + + +
                                                                                                        +
                                                                                                        +
                                                                                                        + + + +
                                                                                                        +
                                                                                                        +
                                                                                                        + + + +
                                                                                                        +
                                                                                                        + + + + + + + +

                                                                                                        edspdf.pipes.classifiers.mask

                                                                                                        + + +
                                                                                                        + + + + +
                                                                                                        + + + +
                                                                                                        + + + + + + +
                                                                                                        + + + + +

                                                                                                        +MaskClassifier + +

                                                                                                        + + +
                                                                                                        + + +

                                                                                                        Simple mask classifier that labels every box lying inside one of the masks +with that mask's label.

                                                                                                        + + + + + +
                                                                                                        + + + + + + + + + + + +
                                                                                                        + +
                                                                                                        + +
                                                                                                        + + +
                                                                                                        + + + +

                                                                                                        +simple_mask_classifier_factory + +

                                                                                                        + + +
                                                                                                        + +

                                                                                                        The simplest form of mask classification. You define the mask; everything else +is tagged as pollution.

                                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                        PARAMETERDESCRIPTION
                                                                                                        pipeline +

                                                                                                        The pipeline object

                                                                                                        +

                                                                                                        + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                                                                        +
                                                                                                        name +

                                                                                                        The name of the component

                                                                                                        +

                                                                                                        + + TYPE: + str + + + DEFAULT: + 'mask-classifier' + +

                                                                                                        +
                                                                                                        x0 +

                                                                                                        The x0 coordinate of the mask

                                                                                                        +

                                                                                                        + + TYPE: + float + +

                                                                                                        +
                                                                                                        y0 +

                                                                                                        The y0 coordinate of the mask

                                                                                                        +

                                                                                                        + + TYPE: + float + +

                                                                                                        +
                                                                                                        x1 +

                                                                                                        The x1 coordinate of the mask

                                                                                                        +

                                                                                                        + + TYPE: + float + +

                                                                                                        +
                                                                                                        y1 +

                                                                                                        The y1 coordinate of the mask

                                                                                                        +

                                                                                                        + + TYPE: + float + +

                                                                                                        +
                                                                                                        threshold +

                                                                                                        The threshold for the alignment

                                                                                                        +

                                                                                                        + + TYPE: + float + + + DEFAULT: + 1.0 + +

                                                                                                        +
                                                                                                        +

                                                                                                        Examples

                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        pipeline.add_pipe(
                                                                                                        +    "mask-classifier",
                                                                                                        +    name="classifier",
                                                                                                        +    config={
                                                                                                        +        "threshold": 0.9,
                                                                                                        +        "x0": 0.1,
                                                                                                        +        "y0": 0.1,
                                                                                                        +        "x1": 0.9,
                                                                                                        +        "y1": 0.9,
                                                                                                        +    },
                                                                                                        +)
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        [components.classifier]
                                                                                                        +@factory = "mask-classifier"
                                                                                                        +x0 = 0.1
                                                                                                        +y0 = 0.1
                                                                                                        +x1 = 0.9
                                                                                                        +y1 = 0.9
                                                                                                        +threshold = 0.9
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        + +
                                                                                                        + +
                                                                                                        + +
                                                                                                        + + + +

                                                                                                        +mask_classifier_factory + +

                                                                                                        + + +
                                                                                                        + +

                                                                                                        A generalisation, wherein the user defines a number of regions.

                                                                                                        +

                                                                                                        The following configuration produces exactly the same classifier as the mask-classifier +example above.

                                                                                                        +

                                                                                                        Any block that is not part of a mask is tagged as pollution.

                                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                        PARAMETERDESCRIPTION
                                                                                                        pipeline +

                                                                                                        The pipeline object

                                                                                                        +

                                                                                                        + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                                                                        +
                                                                                                        name + +

                                                                                                        + + TYPE: + str + + + DEFAULT: + 'multi-mask-classifier' + +

                                                                                                        +
                                                                                                        threshold +

                                                                                                        The threshold for the alignment

                                                                                                        +

                                                                                                        + + TYPE: + float + + + DEFAULT: + 1.0 + +

                                                                                                        +
                                                                                                        masks +

                                                                                                        The masks

                                                                                                        +

                                                                                                        + + TYPE: + Box + + + DEFAULT: + {} + +

                                                                                                        +
                                                                                                        +

                                                                                                        Examples

                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        pipeline.add_pipe(
                                                                                                        +    "multi-mask-classifier",
                                                                                                        +    name="classifier",
                                                                                                        +    config={
                                                                                                        +        "threshold": 0.9,
                                                                                                        +        "mymask": {"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.3, "label": "body"},
                                                                                                        +    },
                                                                                                        +)
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        [components.classifier]
                                                                                                        +@factory = "multi-mask-classifier"
                                                                                                        +threshold = 0.9
                                                                                                        +
                                                                                                        +[components.classifier.mymask]
                                                                                                        +label = "body"
                                                                                                        +x0 = 0.1
                                                                                                        +y0 = 0.1
                                                                                                        +x1 = 0.9
                                                                                                        +y1 = 0.9
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +

                                                                                                        The following configuration defines a header region and a body region.

                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        pipeline.add_pipe(
                                                                                                        +    "multi-mask-classifier",
                                                                                                        +    name="classifier",
                                                                                                        +    config={
                                                                                                        +        "threshold": 0.9,
                                                                                                        +        "header": {"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.3, "label": "header"},
                                                                                                        +        "body": {"x0": 0.1, "y0": 0.3, "x1": 0.9, "y1": 0.9, "label": "body"},
                                                                                                        +    },
                                                                                                        +)
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        [components.classifier]
                                                                                                        +@factory = "multi-mask-classifier"
                                                                                                        +threshold = 0.9
                                                                                                        +
                                                                                                        +[components.classifier.header]
                                                                                                        +label = "header"
                                                                                                        +x0 = 0.1
                                                                                                        +y0 = 0.1
                                                                                                        +x1 = 0.9
                                                                                                        +y1 = 0.3
                                                                                                        +
                                                                                                        +[components.classifier.body]
                                                                                                        +label = "body"
                                                                                                        +x0 = 0.1
                                                                                                        +y0 = 0.3
                                                                                                        +x1 = 0.9
                                                                                                        +y1 = 0.9
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        +
                                                                                                        + +
                                                                                                        + +
                                                                                                        + + + +
                                                                                                        + +
                                                                                                        + +
                                                                                                        +

                                                                                                          + + + + + + +
                                                                                                          +
                                                                                                          + + +
                                                                                                          + +
                                                                                                          + + + +
                                                                                                          +
                                                                                                          +
                                                                                                          +
                                                                                                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/classifiers/random/index.html b/main/reference/edspdf/pipes/classifiers/random/index.html new file mode 100644 index 00000000..6d97aac2 --- /dev/null +++ b/main/reference/edspdf/pipes/classifiers/random/index.html @@ -0,0 +1,2532 @@ + + + + + + + + + + + + + + + + + + + + + + random - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                          + +
                                                                                                          + + + + + + + + +
                                                                                                          + + +
                                                                                                          + +
                                                                                                          + + + + + + +
                                                                                                          +
                                                                                                          + + + +
                                                                                                          +
                                                                                                          +
                                                                                                          + + + + +
                                                                                                          +
                                                                                                          +
                                                                                                          + + + +
                                                                                                          +
                                                                                                          +
                                                                                                          + + + +
                                                                                                          +
                                                                                                          +
                                                                                                          + + + +
                                                                                                          +
                                                                                                          + + + + + + + +

                                                                                                          edspdf.pipes.classifiers.random

                                                                                                          + + +
                                                                                                          + + + + +
                                                                                                          + + + +
                                                                                                          + + + + + + +
                                                                                                          + + + + +

                                                                                                          +RandomClassifier + +

                                                                                                          + + +
                                                                                                          + + +

                                                                                                          Random classifier, for chaos purposes. Assigns a randomly chosen label to each box.

                                                                                                          + +

                                                                                                          Parameters

                                                                                                          + + + + + + + + + + + + + + + + + + + + + +
                                                                                                          PARAMETERDESCRIPTION
                                                                                                          pipeline +

                                                                                                          The pipeline object.

                                                                                                          +

                                                                                                          + + TYPE: + Pipeline + +

                                                                                                          +
                                                                                                          name +

                                                                                                          The name of the component.

                                                                                                          +

                                                                                                          + + TYPE: + str + + + DEFAULT: + 'random-classifier' + +

                                                                                                          +
                                                                                                          labels +

                                                                                                          The labels to assign to each line. If a list is passed, each label is assigned +with equal probability. If a dict is passed, the keys are the labels and the +values are the probabilities.

                                                                                                          +

                                                                                                          + + TYPE: + Union[List[str], Dict[str, float]] + +

                                                                                                          +
                                                                                                          + + + + + +
                                                                                                          + + + + + + + + + + + +
                                                                                                          + +
                                                                                                          + +
                                                                                                          + + + + +
                                                                                                          + +
                                                                                                          + +
                                                                                                          +

                                                                                                            + + + + + + +
                                                                                                            +
                                                                                                            + + +
                                                                                                            + +
                                                                                                            + + + +
                                                                                                            +
                                                                                                            +
                                                                                                            +
                                                                                                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/classifiers/trainable/index.html b/main/reference/edspdf/pipes/classifiers/trainable/index.html new file mode 100644 index 00000000..a4d1f1d5 --- /dev/null +++ b/main/reference/edspdf/pipes/classifiers/trainable/index.html @@ -0,0 +1,2590 @@ + + + + + + + + + + + + + + + + + + + + + + trainable - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                            + +
                                                                                                            + + + + + + + + +
                                                                                                            + + +
                                                                                                            + +
                                                                                                            + + + + + + +
                                                                                                            +
                                                                                                            + + + +
                                                                                                            +
                                                                                                            +
                                                                                                            + + + + +
                                                                                                            +
                                                                                                            +
                                                                                                            + + + +
                                                                                                            +
                                                                                                            +
                                                                                                            + + + +
                                                                                                            +
                                                                                                            +
                                                                                                            + + + +
                                                                                                            +
                                                                                                            + + + + + + + +

                                                                                                            edspdf.pipes.classifiers.trainable

                                                                                                            + + +
                                                                                                            + + + + +
                                                                                                            + + + +
                                                                                                            + + + + + + +
                                                                                                            + + + + +

                                                                                                            +TrainableClassifier + +

                                                                                                            + + +
                                                                                                            +

                                                                                                            + Bases: TrainablePipe[Dict[str, Any]]

                                                                                                            + + +

                                                                                                            This component predicts a label for each box over the whole document using machine +learning.

                                                                                                            +
                                                                                                            +

                                                                                                            Note

                                                                                                            +

                                                                                                            You must train your model to use this classifier. +See Model training for more information.

                                                                                                            +
                                                                                                            +

                                                                                                            Examples

                                                                                                            +

                                                                                                            The classifier is composed of the following blocks:

                                                                                                            +
                                                                                                              +
                                                                                                            • a configurable box embedding layer
                                                                                                            • +
                                                                                                            • a linear classification layer
                                                                                                            • +
                                                                                                            +

                                                                                                            In this example, we use a box-embedding layer to generate the embeddings +of the boxes. It is composed of a text encoder that embeds the text features of the +boxes and a layout encoder that embeds the layout features of the boxes. +These two embeddings are summed and passed through an optional contextualizer, +here a box-transformer.

                                                                                                            +
                                                                                                            +
                                                                                                            +
                                                                                                            +
                                                                                                            pipeline.add_pipe(
                                                                                                            +    "trainable-classifier",
                                                                                                            +    name="classifier",
                                                                                                            +    config={
                                                                                                            +        # simple embedding computed by pooling embeddings of words in each box
                                                                                                            +        "embedding": {
                                                                                                            +            "@factory": "sub-box-cnn-pooler",
                                                                                                            +            "out_channels": 64,
                                                                                                            +            "kernel_sizes": (3, 4, 5),
                                                                                                            +            "embedding": {
                                                                                                            +                "@factory": "simple-text-embedding",
                                                                                                            +                "size": 72,
                                                                                                            +            },
                                                                                                            +        },
                                                                                                            +        "labels": ["body", "pollution"],
                                                                                                            +    },
                                                                                                            +)
                                                                                                            +
                                                                                                            +
                                                                                                            +
                                                                                                            +
                                                                                                            [components.classifier]
                                                                                                            +@factory = "trainable-classifier"
                                                                                                            +labels = ["body", "pollution"]
                                                                                                            +
                                                                                                            +[components.classifier.embedding]
                                                                                                            +@factory = "sub-box-cnn-pooler"
                                                                                                            +out_channels = 64
                                                                                                            +kernel_sizes = (3, 4, 5)
                                                                                                            +
                                                                                                            +[components.classifier.embedding.embedding]
                                                                                                            +@factory = "simple-text-embedding"
                                                                                                            +size = 72
                                                                                                            +
                                                                                                            +
                                                                                                            +
                                                                                                            +
                                                                                                            + +

                                                                                                            Parameters

                                                                                                            + + + + + + + + + + + + + + + + + +
                                                                                                            PARAMETERDESCRIPTION
                                                                                                            labels +

                                                                                                            Initial labels of the classifier (will be completed during initialization)

                                                                                                            +

                                                                                                            + + TYPE: + Sequence[str] + + + DEFAULT: + ('pollution') + +

                                                                                                            +
                                                                                                            embedding +

                                                                                                            Embedding module to encode the PDF boxes

                                                                                                            +

                                                                                                            + + TYPE: + TrainablePipe[EmbeddingOutput] + +

                                                                                                            +
                                                                                                            + + + + + +
                                                                                                            + + + + + + + + + + + +
                                                                                                            + +
                                                                                                            + +
                                                                                                            + + + + +
                                                                                                            + +
                                                                                                            + +
                                                                                                            +

                                                                                                              + + + + + + +
                                                                                                              +
                                                                                                              + + +
                                                                                                              + +
                                                                                                              + + + +
                                                                                                              +
                                                                                                              +
                                                                                                              +
                                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/embeddings/box_layout_embedding/index.html b/main/reference/edspdf/pipes/embeddings/box_layout_embedding/index.html new file mode 100644 index 00000000..ef3180be --- /dev/null +++ b/main/reference/edspdf/pipes/embeddings/box_layout_embedding/index.html @@ -0,0 +1,2588 @@ + + + + + + + + + + + + + + + + + + + + + + box_layout_embedding - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                              + +
                                                                                                              + + + + + + + + +
                                                                                                              + + +
                                                                                                              + +
                                                                                                              + + + + + + +
                                                                                                              +
                                                                                                              + + + +
                                                                                                              +
                                                                                                              +
                                                                                                              + + + + +
                                                                                                              +
                                                                                                              +
                                                                                                              + + + +
                                                                                                              +
                                                                                                              +
                                                                                                              + + + +
                                                                                                              +
                                                                                                              +
                                                                                                              + + + +
                                                                                                              +
                                                                                                              + + + + + + + +

                                                                                                              edspdf.pipes.embeddings.box_layout_embedding

                                                                                                              + + +
                                                                                                              + + + + +
                                                                                                              + + + +
                                                                                                              + + + + + + +
                                                                                                              + + + + +

                                                                                                              +BoxLayoutEmbedding + +

                                                                                                              + + +
                                                                                                              +

                                                                                                              + Bases: TrainablePipe[EmbeddingOutput]

                                                                                                              + + +

                                                                                                              This component encodes the geometrical features of a box, as extracted by the +BoxLayoutPreprocessor module, into an embedding. For position modes, use:

                                                                                                              +
                                                                                                                +
                                                                                                              • "sin" to embed positions with a fixed + SinusoidalEmbedding
                                                                                                              • +
                                                                                                              • "learned" to embed positions using a learned standard pytorch embedding layer
                                                                                                              • +
                                                                                                              +

                                                                                                              Each produced embedding is the concatenation of the box width, height and the top, +left, bottom and right coordinates, each embedded depending on the *_mode param.

                                                                                                              + +

                                                                                                              Parameters

                                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                              PARAMETERDESCRIPTION
                                                                                                              size +

                                                                                                              Size of the output box embedding

                                                                                                              +

                                                                                                              + + TYPE: + int + +

                                                                                                              +
                                                                                                              n_positions +

                                                                                                              Number of position embeddings stored in the PositionEmbedding module

                                                                                                              +

                                                                                                              + + TYPE: + int + +

                                                                                                              +
                                                                                                              x_mode +

                                                                                                              Position embedding mode of the x coordinates

                                                                                                              +

                                                                                                              + + TYPE: + Literal['sin', 'learned'] + + + DEFAULT: + 'sin' + +

                                                                                                              +
                                                                                                              y_mode +

                                                                                                              Position embedding mode of the y coordinates

                                                                                                              +

                                                                                                              + + TYPE: + Literal['sin', 'learned'] + + + DEFAULT: + 'sin' + +

                                                                                                              +
                                                                                                              w_mode +

                                                                                                              Position embedding mode of the width features

                                                                                                              +

                                                                                                              + + TYPE: + Literal['sin', 'learned'] + + + DEFAULT: + 'sin' + +

                                                                                                              +
                                                                                                              h_mode +

                                                                                                              Position embedding mode of the height features

                                                                                                              +

                                                                                                              + + TYPE: + Literal['sin', 'learned'] + + + DEFAULT: + 'sin' + +

                                                                                                              +
                                                                                                              + + + + + +
                                                                                                              + + + + + + + + + + + +
                                                                                                              + +
                                                                                                              + +
                                                                                                              + + + + +
                                                                                                              + +
                                                                                                              + +
                                                                                                              +

                                                                                                                + + + + + + +
                                                                                                                +
                                                                                                                + + +
                                                                                                                + +
                                                                                                                + + + +
                                                                                                                +
                                                                                                                +
                                                                                                                +
                                                                                                                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/embeddings/box_layout_preprocessor/index.html b/main/reference/edspdf/pipes/embeddings/box_layout_preprocessor/index.html new file mode 100644 index 00000000..db44f775 --- /dev/null +++ b/main/reference/edspdf/pipes/embeddings/box_layout_preprocessor/index.html @@ -0,0 +1,2472 @@ + + + + + + + + + + + + + + + + + + + + + + box_layout_preprocessor - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                + +
                                                                                                                + + + + + + + + +
                                                                                                                + + +
                                                                                                                + +
                                                                                                                + + + + + + +
                                                                                                                +
                                                                                                                + + + +
                                                                                                                +
                                                                                                                +
                                                                                                                + + + + +
                                                                                                                +
                                                                                                                +
                                                                                                                + + + +
                                                                                                                +
                                                                                                                +
                                                                                                                + + + +
                                                                                                                +
                                                                                                                +
                                                                                                                + + + +
                                                                                                                +
                                                                                                                + + + + + + + +

                                                                                                                edspdf.pipes.embeddings.box_layout_preprocessor

                                                                                                                + + +
                                                                                                                + + + + +
                                                                                                                + + + +
                                                                                                                + + + + + + +
                                                                                                                + + + + +

                                                                                                                +BoxLayoutPreprocessor + +

                                                                                                                + + +
                                                                                                                +

                                                                                                                + Bases: TrainablePipe[BoxLayoutBatch]

                                                                                                                + + +

                                                                                                                The box preprocessor is a singleton since it is not configurable. +The following features of each box of an input PDFDoc document are encoded +as 1D tensors:

                                                                                                                +
                                                                                                                  +
                                                                                                                • boxes_page: page index of the box
                                                                                                                • +
                                                                                                                • boxes_first_page: is the box on the first page
                                                                                                                • +
                                                                                                                • boxes_last_page: is the box on the last page
                                                                                                                • +
                                                                                                                • boxes_xmin: left position of the box
                                                                                                                • +
                                                                                                                • boxes_ymin: bottom position of the box
                                                                                                                • +
                                                                                                                • boxes_xmax: right position of the box
                                                                                                                • +
                                                                                                                • boxes_ymax: top position of the box
                                                                                                                • +
                                                                                                                • boxes_w: width of the box
                                                                                                                • +
                                                                                                                • boxes_h: height of the box
                                                                                                                • +
                                                                                                                +

                                                                                                                The preprocessor also returns an additional tensor:

                                                                                                                +
                                                                                                                  +
                                                                                                                • page_boxes_id: box indices per page to index the + above 1D tensors (LongTensor: n_pages * n_boxes)
                                                                                                                • +
                                                                                                                + + + + + +
                                                                                                                + + + + + + + + + + + +
                                                                                                                + +
                                                                                                                + +
                                                                                                                + + + + +
                                                                                                                + +
                                                                                                                + +
                                                                                                                +

                                                                                                                  + + + + + + +
                                                                                                                  +
                                                                                                                  + + +
                                                                                                                  + +
                                                                                                                  + + + +
                                                                                                                  +
                                                                                                                  +
                                                                                                                  +
                                                                                                                  + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/embeddings/box_transformer/index.html b/main/reference/edspdf/pipes/embeddings/box_transformer/index.html new file mode 100644 index 00000000..f11e5467 --- /dev/null +++ b/main/reference/edspdf/pipes/embeddings/box_transformer/index.html @@ -0,0 +1,2668 @@ + + + + + + + + + + + + + + + + + + + + + + box_transformer - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                  + +
                                                                                                                  + + + + + + + + +
                                                                                                                  + + +
                                                                                                                  + +
                                                                                                                  + + + + + + +
                                                                                                                  +
                                                                                                                  + + + +
                                                                                                                  +
                                                                                                                  +
                                                                                                                  + + + + +
                                                                                                                  +
                                                                                                                  +
                                                                                                                  + + + +
                                                                                                                  +
                                                                                                                  +
                                                                                                                  + + + +
                                                                                                                  +
                                                                                                                  +
                                                                                                                  + + + +
                                                                                                                  +
                                                                                                                  + + + + + + + +

                                                                                                                  edspdf.pipes.embeddings.box_transformer

                                                                                                                  + + +
                                                                                                                  + + + + +
                                                                                                                  + + + +
                                                                                                                  + + + + + + +
                                                                                                                  + + + + +

                                                                                                                  +BoxTransformer + +

                                                                                                                  + + +
                                                                                                                  +

                                                                                                                  + Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                  + + +

                                                                                                                  BoxTransformer using +BoxTransformerModule +under the hood.

                                                                                                                  +
                                                                                                                  +

                                                                                                                  Note

                                                                                                                  +

                                                                                                                  This module is a TrainablePipe +and can be used in a Pipeline, while +BoxTransformerModule +is a standard PyTorch module, which does not take care of the +preprocessing, collating, etc. of the input documents.

                                                                                                                  +
                                                                                                                  + +

                                                                                                                  Parameters

                                                                                                                  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                  PARAMETERDESCRIPTION
                                                                                                                  pipeline +

                                                                                                                  Pipeline instance

                                                                                                                  +

                                                                                                                  + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                                                                                  +
                                                                                                                  name +

                                                                                                                  Name of the component

                                                                                                                  +

                                                                                                                  + + TYPE: + str + + + DEFAULT: + 'box-transformer' + +

                                                                                                                  +
                                                                                                                  num_heads +

                                                                                                                  Number of attention heads in the attention layers

                                                                                                                  +

                                                                                                                  + + TYPE: + int + + + DEFAULT: + 2 + +

                                                                                                                  +
                                                                                                                  n_relative_positions +

                                                                                                                  Maximum range of embeddable relative positions between boxes (further +distances are capped to ±n_relative_positions // 2)

                                                                                                                  +

                                                                                                                  + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                                                  +
                                                                                                                  dropout_p +

                                                                                                                  Dropout probability both for the attention layers and embedding projections

                                                                                                                  +

                                                                                                                  + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                                                                                                  +
                                                                                                                  head_size +

                                                                                                                  Head sizes of the attention layers

                                                                                                                  +

                                                                                                                  + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                                                  +
                                                                                                                  activation +

                                                                                                                  Activation function used in the linear->activation->linear transformations

                                                                                                                  +

                                                                                                                  + + TYPE: + ActivationFunction + + + DEFAULT: + 'gelu' + +

                                                                                                                  +
                                                                                                                  init_resweight +

                                                                                                                  Initial weight of the residual gates. +At 0, the layer acts (initially) as an identity function, and at 1 as +a standard Transformer layer. +Initializing with a value close to 0 can help the training converge.

                                                                                                                  +

                                                                                                                  + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                                                                                                  +
                                                                                                                  attention_mode +

                                                                                                                  Mode of relative position infused attention layer. +See the relative attention +documentation for more information.

                                                                                                                  +

                                                                                                                  + + TYPE: + Sequence[Literal['c2c', 'c2p', 'p2c']] + + + DEFAULT: + ('c2c', 'c2p', 'p2c') + +

                                                                                                                  +
                                                                                                                  n_layers +

                                                                                                                  Number of layers in the Transformer

                                                                                                                  +

                                                                                                                  + + TYPE: + int + + + DEFAULT: + 2 + +

                                                                                                                  +
                                                                                                                  + + + + + +
                                                                                                                  + + + + + + + + + + + +
                                                                                                                  + +
                                                                                                                  + +
                                                                                                                  + + + + +
                                                                                                                  + +
                                                                                                                  + +
                                                                                                                  +

                                                                                                                    + + + + + + +
                                                                                                                    +
                                                                                                                    + + +
                                                                                                                    + +
                                                                                                                    + + + +
                                                                                                                    +
                                                                                                                    +
                                                                                                                    +
                                                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/embeddings/embedding_combiner/index.html b/main/reference/edspdf/pipes/embeddings/embedding_combiner/index.html new file mode 100644 index 00000000..59d5ab24 --- /dev/null +++ b/main/reference/edspdf/pipes/embeddings/embedding_combiner/index.html @@ -0,0 +1,2577 @@ + + + + + + + + + + + + + + + + + + + + + + embedding_combiner - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                    + +
                                                                                                                    + + + + + + + + +
                                                                                                                    + + +
                                                                                                                    + +
                                                                                                                    + + + + + + +
                                                                                                                    +
                                                                                                                    + + + +
                                                                                                                    +
                                                                                                                    +
                                                                                                                    + + + + +
                                                                                                                    +
                                                                                                                    +
                                                                                                                    + + + +
                                                                                                                    +
                                                                                                                    +
                                                                                                                    + + + +
                                                                                                                    +
                                                                                                                    +
                                                                                                                    + + + +
                                                                                                                    +
                                                                                                                    + + + + + + + +

                                                                                                                    edspdf.pipes.embeddings.embedding_combiner

                                                                                                                    + + +
                                                                                                                    + + + + +
                                                                                                                    + + + +
                                                                                                                    + + + + + + +
                                                                                                                    + + + + +

                                                                                                                    +EmbeddingCombiner + +

                                                                                                                    + + +
                                                                                                                    +

                                                                                                                    + Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                    + + + +

                                                                                                                    Encodes boxes using a combination of multiple encoders

                                                                                                                    + +

                                                                                                                    Parameters

                                                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                    PARAMETERDESCRIPTION
                                                                                                                    pipeline +

                                                                                                                    The pipeline object

                                                                                                                    +

                                                                                                                    + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                                                                                    +
                                                                                                                    name +

                                                                                                                    The name of the pipe

                                                                                                                    +

                                                                                                                    + + TYPE: + str + + + DEFAULT: + 'embedding-combiner' + +

                                                                                                                    +
                                                                                                                    mode +

                                                                                                                    The mode to use to combine the encoders:

                                                                                                                    +
                                                                                                                      +
                                                                                                                    • sum: Sum the outputs of the encoders
                                                                                                                    • +
                                                                                                                    • cat: Concatenate the outputs of the encoders
                                                                                                                    • +
                                                                                                                    +

                                                                                                                    + + TYPE: + Literal['sum', 'cat'] + + + DEFAULT: + 'sum' + +

                                                                                                                    +
                                                                                                                    dropout_p +

                                                                                                                    Dropout probability used on the output of the box and textual encoders

                                                                                                                    +

                                                                                                                    + + TYPE: + float + + + DEFAULT: + 0.0 + +

                                                                                                                    +
                                                                                                                    encoders +

                                                                                                                    The encoders to use. The keys are the names of the encoders and the values +are the encoders themselves.

                                                                                                                    +

                                                                                                                    + + TYPE: + TrainablePipe[EmbeddingOutput] + + + DEFAULT: + {} + +

                                                                                                                    +
                                                                                                                    + + + + +
                                                                                                                    + + + + + + + + + + + +
                                                                                                                    + +
                                                                                                                    + +
                                                                                                                    + + + + +
                                                                                                                    + +
                                                                                                                    + +
                                                                                                                    +

                                                                                                                      + + + + + + +
                                                                                                                      +
                                                                                                                      + + +
                                                                                                                      + +
                                                                                                                      + + + +
                                                                                                                      +
                                                                                                                      +
                                                                                                                      +
                                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/embeddings/huggingface_embedding/index.html b/main/reference/edspdf/pipes/embeddings/huggingface_embedding/index.html new file mode 100644 index 00000000..8505b7a7 --- /dev/null +++ b/main/reference/edspdf/pipes/embeddings/huggingface_embedding/index.html @@ -0,0 +1,2703 @@ + + + + + + + + + + + + + + + + + + + + + + huggingface_embedding - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                      + +
                                                                                                                      + + + + + + + + +
                                                                                                                      + + +
                                                                                                                      + +
                                                                                                                      + + + + + + +
                                                                                                                      +
                                                                                                                      + + + +
                                                                                                                      +
                                                                                                                      +
                                                                                                                      + + + + +
                                                                                                                      +
                                                                                                                      +
                                                                                                                      + + + +
                                                                                                                      +
                                                                                                                      +
                                                                                                                      + + + +
                                                                                                                      +
                                                                                                                      +
                                                                                                                      + + + +
                                                                                                                      +
                                                                                                                      + + + + + + + +

                                                                                                                      edspdf.pipes.embeddings.huggingface_embedding

                                                                                                                      + + +
                                                                                                                      + + + + +
                                                                                                                      + + + +
                                                                                                                      + + + + + + +
                                                                                                                      + + + + +

                                                                                                                      +HuggingfaceEmbedding + +

                                                                                                                      + + +
                                                                                                                      +

                                                                                                                      + Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                      + + +

                                                                                                                      The HuggingfaceEmbedding component is a wrapper around the Huggingface multi-modal +models. Such pre-trained models should offer better results than a model trained +from scratch. Compared to using the raw Huggingface model, we offer a simple +mechanism to split long documents into strided windows before feeding them to the +model.

                                                                                                                      +

                                                                                                                      Windowing

                                                                                                                      +

                                                                                                                      The HuggingfaceEmbedding component splits long documents into smaller windows before +feeding them to the model. This is done to avoid hitting the maximum number of +tokens that can be processed by the model on a single device. The window size and +stride can be configured using the window and stride parameters. The default +values are 510 and 255 respectively, which means that the model will process windows +of 510 tokens, each separated by 255 tokens. Whenever a token appears in multiple +windows, the embedding of the "most contextualized" occurrence is used, i.e. the +occurrence that is the closest to the center of its window.

                                                                                                                      +

                                                                                                                      Here is an overview of how this works in a classifier model: +Transformer windowing

                                                                                                                      +

                                                                                                                      Examples

                                                                                                                      +

                                                                                                                      Here is an example of how to define a pipeline with the HuggingfaceEmbedding +component:

                                                                                                                      +
                                                                                                                      from edspdf import Pipeline
                                                                                                                      +
                                                                                                                      +model = Pipeline()
                                                                                                                      +model.add_pipe(
                                                                                                                      +    "pdfminer-extractor",
                                                                                                                      +    name="extractor",
                                                                                                                      +    config={
                                                                                                                      +        "render_pages": True,
                                                                                                                      +    },
                                                                                                                      +)
                                                                                                                      +model.add_pipe(
                                                                                                                      +    "huggingface-embedding",
                                                                                                                      +    name="embedding",
                                                                                                                      +    config={
                                                                                                                      +        "model": "microsoft/layoutlmv3-base",
                                                                                                                      +        "use_image": False,
                                                                                                                      +        "window": 128,
                                                                                                                      +        "stride": 64,
                                                                                                                      +        "line_pooling": "mean",
                                                                                                                      +    },
                                                                                                                      +)
                                                                                                                      +model.add_pipe(
                                                                                                                      +    "trainable-classifier",
                                                                                                                      +    name="classifier",
                                                                                                                      +    config={
                                                                                                                      +        "embedding": model.get_pipe("embedding"),
                                                                                                                      +        "labels": [],
                                                                                                                      +    },
                                                                                                                      +)
                                                                                                                      +
                                                                                                                      +

                                                                                                                      This model can then be trained following the +training recipe.

                                                                                                                      + +

                                                                                                                      Parameters

                                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                      PARAMETERDESCRIPTION
                                                                                                                      pipeline +

                                                                                                                      The pipeline instance

                                                                                                                      +

                                                                                                                      + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                                                                                      +
                                                                                                                      name +

                                                                                                                      The component name

                                                                                                                      +

                                                                                                                      + + TYPE: + str + + + DEFAULT: + 'huggingface-embedding' + +

                                                                                                                      +
                                                                                                                      model +

                                                                                                                      The Huggingface model name or path

                                                                                                                      +

                                                                                                                      + + TYPE: + str + + + DEFAULT: + None + +

                                                                                                                      +
                                                                                                                      use_image +

                                                                                                                      Whether to use the image or not in the model

                                                                                                                      +

                                                                                                                      + + TYPE: + bool + + + DEFAULT: + True + +

                                                                                                                      +
                                                                                                                      window +

                                                                                                                      The window size to use when splitting long documents into smaller windows +before feeding them to the Transformer model (default: 510 = 512 - 2)

                                                                                                                      +

                                                                                                                      + + TYPE: + int + + + DEFAULT: + 510 + +

                                                                                                                      +
                                                                                                                      stride +

                                                                                                                      The stride (distance between windows) to use when splitting long documents into +smaller windows (default: 510 / 2 = 255)

                                                                                                                      +

                                                                                                                      + + TYPE: + int + + + DEFAULT: + 255 + +

                                                                                                                      +
                                                                                                                      line_pooling +

                                                                                                                      The pooling strategy to use when combining the embeddings of the tokens in a +line into a single line embedding

                                                                                                                      +

                                                                                                                      + + TYPE: + Literal['mean', 'max', 'sum'] + + + DEFAULT: + 'mean' + +

                                                                                                                      +
                                                                                                                      max_tokens_per_device +

                                                                                                                      The maximum number of tokens that can be processed by the model on a single +device. This does not affect the results but can be used to reduce the memory +usage of the model, at the cost of a longer processing time.

                                                                                                                      +

                                                                                                                      + + TYPE: + int + + + DEFAULT: + 128 * 128 + +

                                                                                                                      +
                                                                                                                      + + + + + +
                                                                                                                      + + + + + + + + + + + +
                                                                                                                      + +
                                                                                                                      + +
                                                                                                                      + + + + +
                                                                                                                      + +
                                                                                                                      + +
                                                                                                                      +

                                                                                                                        + + + + + + +
                                                                                                                        +
                                                                                                                        + + +
                                                                                                                        + +
                                                                                                                        + + + +
                                                                                                                        +
                                                                                                                        +
                                                                                                                        +
                                                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/embeddings/index.html b/main/reference/edspdf/pipes/embeddings/index.html new file mode 100644 index 00000000..a720f6c0 --- /dev/null +++ b/main/reference/edspdf/pipes/embeddings/index.html @@ -0,0 +1,2360 @@ + + + + + + + + + + + + + + + + + + + + + + embeddings - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                        + +
                                                                                                                        + + + + + + + + +
                                                                                                                        + + +
                                                                                                                        + +
                                                                                                                        + + + + + + +
                                                                                                                        +
                                                                                                                        + + + +
                                                                                                                        +
                                                                                                                        +
                                                                                                                        + + + + +
                                                                                                                        +
                                                                                                                        +
                                                                                                                        + + + +
                                                                                                                        +
                                                                                                                        +
                                                                                                                        + + + +
                                                                                                                        +
                                                                                                                        +
                                                                                                                        + + + +
                                                                                                                        +
                                                                                                                        + + + + + + + +

                                                                                                                        edspdf.pipes.embeddings

                                                                                                                        + + +
                                                                                                                        + + + + +
                                                                                                                        + + + +
                                                                                                                        + + + + + + + + + + + +
                                                                                                                        + +
                                                                                                                        + +
                                                                                                                        +

                                                                                                                          + + + + + + +
                                                                                                                          +
                                                                                                                          + + +
                                                                                                                          + +
                                                                                                                          + + + +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/embeddings/simple_text_embedding/index.html b/main/reference/edspdf/pipes/embeddings/simple_text_embedding/index.html new file mode 100644 index 00000000..9f1680cf --- /dev/null +++ b/main/reference/edspdf/pipes/embeddings/simple_text_embedding/index.html @@ -0,0 +1,2631 @@ + + + + + + + + + + + + + + + + + + + + + + simple_text_embedding - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                          + +
                                                                                                                          + + + + + + + + +
                                                                                                                          + + +
                                                                                                                          + +
                                                                                                                          + + + + + + +
                                                                                                                          +
                                                                                                                          + + + +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          + + + + +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          + + + +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          + + + +
                                                                                                                          +
                                                                                                                          +
                                                                                                                          + + + +
                                                                                                                          +
                                                                                                                          + + + + + + + +

                                                                                                                          edspdf.pipes.embeddings.simple_text_embedding

                                                                                                                          + + +
                                                                                                                          + + + + +
                                                                                                                          + + + +
                                                                                                                          + + + + + + +
                                                                                                                          + + + + +

                                                                                                                          +SimpleTextEmbedding + +

                                                                                                                          + + +
                                                                                                                          +

                                                                                                                          + Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                          + + +

                                                                                                                          A module that embeds the textual features of the blocks

                                                                                                                          + + + +

                                                                                                                          Parameters

                                                                                                                          + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                          PARAMETERDESCRIPTION
                                                                                                                          size +

                                                                                                                          Size of the output box embedding

                                                                                                                          +

                                                                                                                          + + TYPE: + int + +

                                                                                                                          +
                                                                                                                          pipeline +

                                                                                                                          The pipeline object

                                                                                                                          +

                                                                                                                          + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                                                                                          +
                                                                                                                          name +

                                                                                                                          Name of the component

                                                                                                                          +

                                                                                                                          + + TYPE: + str + + + DEFAULT: + 'simple-text-embedding' + +

                                                                                                                          +
                                                                                                                          + + + + +
                                                                                                                          + + + + + + + + + + + +
                                                                                                                          + +
                                                                                                                          + +
                                                                                                                          + + +
                                                                                                                          + + + +

                                                                                                                          +word_shape + +

                                                                                                                          + + +
                                                                                                                          + +

                                                                                                                          Converts a word into its shape following the algorithm used in the +spaCy library.

                                                                                                                          +

                                                                                                                          https://github.com/explosion/spaCy/blob/b69d249a/spacy/lang/lex_attrs.py#L118

                                                                                                                          + + + + + + + + + + + + + + +
                                                                                                                          PARAMETERDESCRIPTION
                                                                                                                          text + +

                                                                                                                          + + TYPE: + str + +

                                                                                                                          +
                                                                                                                          + + + + + + + + + + + + + + + + + + + + +
                                                                                                                          RETURNSDESCRIPTION
                                                                                                                          + + str + + +
                                                                                                                          + +
                                                                                                                          +
                                                                                                                          + + The word shape + + +
                                                                                                                          + +
                                                                                                                          +
                                                                                                                          + +
                                                                                                                          + +
                                                                                                                          + + + +
                                                                                                                          + +
                                                                                                                          + +
                                                                                                                          +

                                                                                                                            + + + + + + +
                                                                                                                            +
                                                                                                                            + + +
                                                                                                                            + +
                                                                                                                            + + + +
                                                                                                                            +
                                                                                                                            +
                                                                                                                            +
                                                                                                                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/embeddings/sub_box_cnn_pooler/index.html b/main/reference/edspdf/pipes/embeddings/sub_box_cnn_pooler/index.html new file mode 100644 index 00000000..cbae621b --- /dev/null +++ b/main/reference/edspdf/pipes/embeddings/sub_box_cnn_pooler/index.html @@ -0,0 +1,2593 @@ + + + + + + + + + + + + + + + + + + + + + + sub_box_cnn_pooler - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                            + +
                                                                                                                            + + + + + + + + +
                                                                                                                            + + +
                                                                                                                            + +
                                                                                                                            + + + + + + +
                                                                                                                            +
                                                                                                                            + + + +
                                                                                                                            +
                                                                                                                            +
                                                                                                                            + + + + +
                                                                                                                            +
                                                                                                                            +
                                                                                                                            + + + +
                                                                                                                            +
                                                                                                                            +
                                                                                                                            + + + +
                                                                                                                            +
                                                                                                                            +
                                                                                                                            + + + +
                                                                                                                            +
                                                                                                                            + + + + + + + +

                                                                                                                            edspdf.pipes.embeddings.sub_box_cnn_pooler

                                                                                                                            + + +
                                                                                                                            + + + + +
                                                                                                                            + + + +
                                                                                                                            + + + + + + +
                                                                                                                            + + + + +

                                                                                                                            +SubBoxCNNPooler + +

                                                                                                                            + + +
                                                                                                                            +

                                                                                                                            + Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                            + + +

                                                                                                                            One-dimensional multi-kernel CNN encoding layer. +Input embeddings are convolved using linear kernels, each parametrized with +a (window) size of kernel_sizes[kernel_i]. +The outputs of the kernels are concatenated together, max-pooled and finally +projected to a size of output_size.

                                                                                                                            + +

                                                                                                                            Parameters

                                                                                                                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                            PARAMETERDESCRIPTION
                                                                                                                            pipeline +

                                                                                                                            Pipeline instance

                                                                                                                            +

                                                                                                                            + + TYPE: + Pipeline + + + DEFAULT: + None + +

                                                                                                                            +
                                                                                                                            name +

                                                                                                                            Name of the component

                                                                                                                            +

                                                                                                                            + + TYPE: + str + + + DEFAULT: + 'sub-box-cnn-pooler' + +

                                                                                                                            +
                                                                                                                            output_size +

                                                                                                                            Size of the output embeddings. +Defaults to the input_size.

                                                                                                                            +

                                                                                                                            + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                                                            +
                                                                                                                            out_channels +

                                                                                                                            Number of channels

                                                                                                                            +

                                                                                                                            + + TYPE: + Optional[int] + + + DEFAULT: + None + +

                                                                                                                            +
                                                                                                                            kernel_sizes +

                                                                                                                            Window size of each kernel

                                                                                                                            +

                                                                                                                            + + TYPE: + Sequence[int] + + + DEFAULT: + (3, 4, 5) + +

                                                                                                                            +
                                                                                                                            activation +

                                                                                                                            Activation function to use

                                                                                                                            +

                                                                                                                            + + TYPE: + ActivationFunction + + + DEFAULT: + 'relu' + +

                                                                                                                            +
                                                                                                                            + + + + + +
                                                                                                                            + + + + + + + + + + + +
                                                                                                                            + +
                                                                                                                            + +
                                                                                                                            + + + + +
                                                                                                                            + +
                                                                                                                            + +
                                                                                                                            +

                                                                                                                              + + + + + + +
                                                                                                                              +
                                                                                                                              + + +
                                                                                                                              + +
                                                                                                                              + + + +
                                                                                                                              +
                                                                                                                              +
                                                                                                                              +
                                                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/extractors/index.html b/main/reference/edspdf/pipes/extractors/index.html new file mode 100644 index 00000000..9145a4e1 --- /dev/null +++ b/main/reference/edspdf/pipes/extractors/index.html @@ -0,0 +1,2360 @@ + + + + + + + + + + + + + + + + + + + + + + extractors - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                              + +
                                                                                                                              + + + + + + + + +
                                                                                                                              + + +
                                                                                                                              + +
                                                                                                                              + + + + + + +
                                                                                                                              +
                                                                                                                              + + + +
                                                                                                                              +
                                                                                                                              +
                                                                                                                              + + + + +
                                                                                                                              +
                                                                                                                              +
                                                                                                                              + + + +
                                                                                                                              +
                                                                                                                              +
                                                                                                                              + + + +
                                                                                                                              +
                                                                                                                              +
                                                                                                                              + + + +
                                                                                                                              +
                                                                                                                              + + + + + + + +

                                                                                                                              edspdf.pipes.extractors

                                                                                                                              + + +
                                                                                                                              + + + + +
                                                                                                                              + + + +
                                                                                                                              + + + + + + + + + + + +
                                                                                                                              + +
                                                                                                                              + +
                                                                                                                              +

                                                                                                                                + + + + + + +
                                                                                                                                +
                                                                                                                                + + +
                                                                                                                                + +
                                                                                                                                + + + +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/extractors/pdfminer/index.html b/main/reference/edspdf/pipes/extractors/pdfminer/index.html new file mode 100644 index 00000000..cfd1fb1f --- /dev/null +++ b/main/reference/edspdf/pipes/extractors/pdfminer/index.html @@ -0,0 +1,2714 @@ + + + + + + + + + + + + + + + + + + + + + + pdfminer - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                + +
                                                                                                                                + + + + + + + + +
                                                                                                                                + + +
                                                                                                                                + +
                                                                                                                                + + + + + + +
                                                                                                                                +
                                                                                                                                + + + +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                + + + + +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                + + + +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                + + + +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                + + + +
                                                                                                                                +
                                                                                                                                + + + + + + + +

                                                                                                                                edspdf.pipes.extractors.pdfminer

                                                                                                                                + + +
                                                                                                                                + + + + +
                                                                                                                                + + + +
                                                                                                                                + + + + + + +
                                                                                                                                + + + + +

                                                                                                                                +PdfMinerExtractor + +

                                                                                                                                + + +
                                                                                                                                + + +

                                                                                                                                We provide a PDF line extractor built on top of +PdfMiner.

                                                                                                                                +

                                                                                                                                This is the most portable extractor, since it is pure-python and can therefore +be run on any platform. Be sure to have a look at their documentation, +especially the part providing a bird's eye view of the PDF extraction process.

                                                                                                                                +

                                                                                                                                Examples

                                                                                                                                +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                pipeline.add_pipe(
                                                                                                                                +    "pdfminer-extractor",
                                                                                                                                +    config=dict(
                                                                                                                                +        extract_style=False,
                                                                                                                                +    ),
                                                                                                                                +)
                                                                                                                                +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                [components.extractor]
                                                                                                                                +@factory = "pdfminer-extractor"
                                                                                                                                +extract_style = false
                                                                                                                                +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                +
                                                                                                                                +

                                                                                                                                And use the pipeline on a PDF document:

                                                                                                                                +
                                                                                                                                from pathlib import Path
                                                                                                                                +
                                                                                                                                +# Apply on a new document
                                                                                                                                +pipeline(Path("path/to/your/pdf/document").read_bytes())
                                                                                                                                +
                                                                                                                                + +

                                                                                                                                Parameters

                                                                                                                                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                PARAMETERDESCRIPTION
                                                                                                                                line_overlap +

                                                                                                                                See PDFMiner documentation

                                                                                                                                +

                                                                                                                                + + TYPE: + float + + + DEFAULT: + 0.5 + +

                                                                                                                                +
                                                                                                                                char_margin +

                                                                                                                                See PDFMiner documentation

                                                                                                                                +

                                                                                                                                + + TYPE: + float + + + DEFAULT: + 2.05 + +

                                                                                                                                +
                                                                                                                                line_margin +

                                                                                                                                See PDFMiner documentation

                                                                                                                                +

                                                                                                                                + + TYPE: + float + + + DEFAULT: + 0.5 + +

                                                                                                                                +
                                                                                                                                word_margin +

                                                                                                                                See PDFMiner documentation

                                                                                                                                +

                                                                                                                                + + TYPE: + float + + + DEFAULT: + 0.1 + +

                                                                                                                                +
                                                                                                                                boxes_flow +

                                                                                                                                See PDFMiner documentation

                                                                                                                                +

                                                                                                                                + + TYPE: + Optional[float] + + + DEFAULT: + 0.5 + +

                                                                                                                                +
                                                                                                                                detect_vertical +

                                                                                                                                See PDFMiner documentation

                                                                                                                                +

                                                                                                                                + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                                                                +
                                                                                                                                all_texts +

                                                                                                                                See PDFMiner documentation

                                                                                                                                +

                                                                                                                                + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                                                                +
                                                                                                                                extract_style +

                                                                                                                                Whether to extract style (font, size, ...) information for each line of +the document. +Default: False

                                                                                                                                +

                                                                                                                                + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                                                                +
                                                                                                                                render_pages +

                                                                                                                                Whether to extract the rendered page as a numpy array in the page.image +attribute (defaults to False)

                                                                                                                                +

                                                                                                                                + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                                                                +
                                                                                                                                render_dpi +

                                                                                                                                DPI to use when rendering the page (defaults to 200)

                                                                                                                                +

                                                                                                                                + + TYPE: + int + + + DEFAULT: + 200 + +

                                                                                                                                +
                                                                                                                                raise_on_error +

                                                                                                                                Whether to raise an error if the PDF cannot be parsed. +Default: False

                                                                                                                                +

                                                                                                                                + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                                                                +
                                                                                                                                + + + + + +
                                                                                                                                + + + + + + + + + + + +
                                                                                                                                + +
                                                                                                                                + +
                                                                                                                                + + + + +
                                                                                                                                + +
                                                                                                                                + +
                                                                                                                                +

                                                                                                                                  + + + + + + +
                                                                                                                                  +
                                                                                                                                  + + +
                                                                                                                                  + +
                                                                                                                                  + + + +
                                                                                                                                  +
                                                                                                                                  +
                                                                                                                                  +
                                                                                                                                  + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/pipes/index.html b/main/reference/edspdf/pipes/index.html new file mode 100644 index 00000000..c440c8d7 --- /dev/null +++ b/main/reference/edspdf/pipes/index.html @@ -0,0 +1,2358 @@ + + + + + + + + + + + + + + + + + + + + + + pipes - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                  + +
                                                                                                                                  + + + + + + + + +
                                                                                                                                  + + +
                                                                                                                                  + +
                                                                                                                                  + + + + + + +
                                                                                                                                  +
                                                                                                                                  + + + +
                                                                                                                                  +
                                                                                                                                  +
                                                                                                                                  + + + + +
                                                                                                                                  +
                                                                                                                                  +
                                                                                                                                  + + + +
                                                                                                                                  +
                                                                                                                                  +
                                                                                                                                  + + + +
                                                                                                                                  +
                                                                                                                                  +
                                                                                                                                  + + + +
                                                                                                                                  +
                                                                                                                                  + + + + + + + +

                                                                                                                                  edspdf.pipes

                                                                                                                                  + + +
                                                                                                                                  + + + + +
                                                                                                                                  + + + +
                                                                                                                                  + + + + + + + + + + + +
                                                                                                                                  + +
                                                                                                                                  + +
                                                                                                                                  +

                                                                                                                                    + + + + + + +
                                                                                                                                    +
                                                                                                                                    + + +
                                                                                                                                    + +
                                                                                                                                    + + + +
                                                                                                                                    +
                                                                                                                                    +
                                                                                                                                    +
                                                                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/registry/index.html b/main/reference/edspdf/registry/index.html new file mode 100644 index 00000000..232b5e25 --- /dev/null +++ b/main/reference/edspdf/registry/index.html @@ -0,0 +1,2861 @@ + + + + + + + + + + + + + + + + + + + + + + registry - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                    + +
                                                                                                                                    + + + + + + + + +
                                                                                                                                    + + +
                                                                                                                                    + +
                                                                                                                                    + + + + + + +
                                                                                                                                    +
                                                                                                                                    + + + +
                                                                                                                                    +
                                                                                                                                    +
                                                                                                                                    + + + + +
                                                                                                                                    +
                                                                                                                                    +
                                                                                                                                    + + + +
                                                                                                                                    +
                                                                                                                                    +
                                                                                                                                    + + + +
                                                                                                                                    +
                                                                                                                                    +
                                                                                                                                    + + + +
                                                                                                                                    +
                                                                                                                                    + + + + + + + +

                                                                                                                                    edspdf.registry

                                                                                                                                    + + +
                                                                                                                                    + + + + +
                                                                                                                                    + + + +
                                                                                                                                    + + + + + + +
                                                                                                                                    + + + + +

                                                                                                                                    +CurriedFactory + +

                                                                                                                                    + + +
                                                                                                                                    + + + + + + +
                                                                                                                                    + + + + + + + + + +
                                                                                                                                    + + + +

                                                                                                                                    +instantiate + +

                                                                                                                                    + + +
                                                                                                                                    + +

                                                                                                                                    We need to support passing in the pipeline object and name to factories from +a config file. Since components can be nested, we need to add them to every +factory in the config.

                                                                                                                                    + +
                                                                                                                                    + +
                                                                                                                                    + + + +
                                                                                                                                    + +
                                                                                                                                    + +
                                                                                                                                    + + + + +

                                                                                                                                    + FactoryRegistry + + +

                                                                                                                                    + + +
                                                                                                                                    +

                                                                                                                                    + Bases: Registry

                                                                                                                                    + + +

                                                                                                                                    A registry that validates the input arguments of the registered functions.

                                                                                                                                    + + + + + +
                                                                                                                                    + + + + + + + + + +
                                                                                                                                    + + + +

                                                                                                                                    +get + +

                                                                                                                                    + + +
                                                                                                                                    + +

                                                                                                                                    Get the registered function for a given name.

                                                                                                                                    +

                                                                                                                                    name (str): The name. +RETURNS (Any): The registered function.

                                                                                                                                    + +
                                                                                                                                    + +
                                                                                                                                    + +
                                                                                                                                    + + + +

                                                                                                                                    +register + +

                                                                                                                                    + + +
                                                                                                                                    + +

                                                                                                                                    This is a convenience wrapper around confit.Registry.register, that +curries the function to be registered, allowing to instantiate the class +later once pipeline and name are known.

                                                                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                    PARAMETERDESCRIPTION
                                                                                                                                    name + +

                                                                                                                                    + + TYPE: + str + +

                                                                                                                                    +
                                                                                                                                    func + +

                                                                                                                                    + + TYPE: + Optional[InFunc] + + + DEFAULT: + None + +

                                                                                                                                    +
                                                                                                                                    default_config + +

                                                                                                                                    + + TYPE: + Dict[str, Any] + + + DEFAULT: + FrozenDict() + +

                                                                                                                                    +
                                                                                                                                    assigns + +

                                                                                                                                    + + TYPE: + Iterable[str] + + + DEFAULT: + FrozenList() + +

                                                                                                                                    +
                                                                                                                                    requires + +

                                                                                                                                    + + TYPE: + Iterable[str] + + + DEFAULT: + FrozenList() + +

                                                                                                                                    +
                                                                                                                                    retokenizes + +

                                                                                                                                    + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                                                                    +
                                                                                                                                    default_score_weights + +

                                                                                                                                    + + TYPE: + Dict[str, Optional[float]] + + + DEFAULT: + FrozenDict() + +

                                                                                                                                    +
                                                                                                                                    + + + + + + + + + + + + + + + + +
                                                                                                                                    RETURNS DESCRIPTION
                                                                                                                                    + + Callable[[InFunc], InFunc] + + +
                                                                                                                                    + +
                                                                                                                                    +
                                                                                                                                    + +
                                                                                                                                    + +
                                                                                                                                    + + + +
                                                                                                                                    + +
                                                                                                                                    + +
                                                                                                                                    + + +
                                                                                                                                    + + + +

                                                                                                                                    +accepted_arguments + +

                                                                                                                                    + + +
                                                                                                                                    + +

                                                                                                                                    Checks that a function accepts a list of keyword arguments.

                                                                                                                                    + + + + + + + + + + + + + + + + + + +
                                                                                                                                    PARAMETER DESCRIPTION
                                                                                                                                    func +

                                                                                                                                    Function to check

                                                                                                                                    +

                                                                                                                                    + + TYPE: + Callable + +

                                                                                                                                    +
                                                                                                                                    args +

                                                                                                                                    Argument or list of arguments to check

                                                                                                                                    +

                                                                                                                                    + + TYPE: + Sequence[str] + +

                                                                                                                                    +
                                                                                                                                    + + + + + + + + + + + + + + + + +
                                                                                                                                    RETURNS DESCRIPTION
                                                                                                                                    + + List[str] + + +
                                                                                                                                    + +
                                                                                                                                    +
                                                                                                                                    + +
                                                                                                                                    + +
                                                                                                                                    + + + +
                                                                                                                                    + +
                                                                                                                                    + +
                                                                                                                                    +

                                                                                                                                      + + + + + + +
                                                                                                                                      +
                                                                                                                                      + + +
                                                                                                                                      + +
                                                                                                                                      + + + +
                                                                                                                                      +
                                                                                                                                      +
                                                                                                                                      +
                                                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/structures/index.html b/main/reference/edspdf/structures/index.html new file mode 100644 index 00000000..fdde26f8 --- /dev/null +++ b/main/reference/edspdf/structures/index.html @@ -0,0 +1,3218 @@ + + + + + + + + + + + + + + + + + + + + + + structures - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                      + +
                                                                                                                                      + + + + + + + + +
                                                                                                                                      + + +
                                                                                                                                      + +
                                                                                                                                      + + + + + + +
                                                                                                                                      +
                                                                                                                                      + + + +
                                                                                                                                      +
                                                                                                                                      +
                                                                                                                                      + + + + +
                                                                                                                                      +
                                                                                                                                      +
                                                                                                                                      + + + +
                                                                                                                                      +
                                                                                                                                      +
                                                                                                                                      + + + +
                                                                                                                                      +
                                                                                                                                      +
                                                                                                                                      + + + +
                                                                                                                                      +
                                                                                                                                      + + + + + + + +

                                                                                                                                      edspdf.structures

                                                                                                                                      + + +
                                                                                                                                      + + + + +
                                                                                                                                      + + + +
                                                                                                                                      + + + + + + +
                                                                                                                                      + + + + +

                                                                                                                                      + PDFDoc + + +

                                                                                                                                      + + +
                                                                                                                                      +

                                                                                                                                      + Bases: BaseModel

                                                                                                                                      + + +

                                                                                                                                      This is the main data structure of the library to hold PDFs. +It contains the content of the PDF, as well as box annotations and text outputs.

                                                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                      ATTRIBUTE DESCRIPTION
                                                                                                                                      content +
                                                                                                                                      +

                                                                                                                                      The content of the PDF document.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + bytes + +

                                                                                                                                      +
                                                                                                                                      id +
                                                                                                                                      +

                                                                                                                                      The ID of the PDF document.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + (str, optional) + +

                                                                                                                                      +
                                                                                                                                      pages +
                                                                                                                                      +

                                                                                                                                      The pages of the PDF document.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + List[Page] + +

                                                                                                                                      +
                                                                                                                                      error +
                                                                                                                                      +

                                                                                                                                      Whether there was an error when processing this PDF document.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + (bool, optional) + +

                                                                                                                                      +
                                                                                                                                      content_boxes +
                                                                                                                                      +

                                                                                                                                      The content boxes/annotations of the PDF document.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + List[Union[TextBox, ImageBox]] + +

                                                                                                                                      +
                                                                                                                                      aggregated_texts +
                                                                                                                                      +

                                                                                                                                      The aggregated text outputs of the PDF document.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + Dict[str, Text] + +

                                                                                                                                      +
                                                                                                                                      text_boxes +
                                                                                                                                      +

                                                                                                                                      The text boxes of the PDF document.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + List[TextBox] + +

                                                                                                                                      +
                                                                                                                                      + + + + + +
                                                                                                                                      + + + + + + + + + + + +
                                                                                                                                      + +
                                                                                                                                      + +
                                                                                                                                      + + + + +

                                                                                                                                      + Page + + +

                                                                                                                                      + + +
                                                                                                                                      +

                                                                                                                                      + Bases: BaseModel

                                                                                                                                      + + +

                                                                                                                                      The Page class represents a page of a PDF document.

                                                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                      ATTRIBUTE DESCRIPTION
                                                                                                                                      page_num +
                                                                                                                                      +

                                                                                                                                      The page number of the page.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + int + +

                                                                                                                                      +
                                                                                                                                      width +
                                                                                                                                      +

                                                                                                                                      The width of the page.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + float + +

                                                                                                                                      +
                                                                                                                                      height +
                                                                                                                                      +

                                                                                                                                      The height of the page.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + float + +

                                                                                                                                      +
                                                                                                                                      doc +
                                                                                                                                      +

                                                                                                                                      The PDF document that this page belongs to.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + PDFDoc + +

                                                                                                                                      +
                                                                                                                                      image +
                                                                                                                                      +

                                                                                                                                      The rendered image of the page, stored as a NumPy array.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + Optional[ndarray] + +

                                                                                                                                      +
                                                                                                                                      text_boxes +
                                                                                                                                      +

                                                                                                                                      The text boxes of the page.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + List[TextBox] + +

                                                                                                                                      +
                                                                                                                                      + + + + + +
                                                                                                                                      + + + + + + + + + + + +
                                                                                                                                      + +
                                                                                                                                      + +
                                                                                                                                      + + + + +

                                                                                                                                      + TextProperties + + +

                                                                                                                                      + + +
                                                                                                                                      +

                                                                                                                                      + Bases: BaseModel

                                                                                                                                      + + +

                                                                                                                                      The TextProperties class represents the style properties of a span of text in a +TextBox.

                                                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                      ATTRIBUTE DESCRIPTION
                                                                                                                                      italic +
                                                                                                                                      +

                                                                                                                                      Whether the text is italic.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + bool + +

                                                                                                                                      +
                                                                                                                                      bold +
                                                                                                                                      +

                                                                                                                                      Whether the text is bold.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + bool + +

                                                                                                                                      +
                                                                                                                                      begin +
                                                                                                                                      +

                                                                                                                                      The beginning index of the span of text.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + int + +

                                                                                                                                      +
                                                                                                                                      end +
                                                                                                                                      +

                                                                                                                                      The ending index of the span of text.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + int + +

                                                                                                                                      +
                                                                                                                                      fontname +
                                                                                                                                      +

                                                                                                                                      The font name of the span of text.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + Optional[str] + +

                                                                                                                                      +
                                                                                                                                      + + + + + +
                                                                                                                                      + + + + + + + + + + + +
                                                                                                                                      + +
                                                                                                                                      + +
                                                                                                                                      + + + + +

                                                                                                                                      + Box + + +

                                                                                                                                      + + +
                                                                                                                                      +

                                                                                                                                      + Bases: BaseModel

                                                                                                                                      + + +

                                                                                                                                      The Box class represents a box annotation in a PDF document. It is the base class +of TextBox.

                                                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                      ATTRIBUTEDESCRIPTION
                                                                                                                                      doc +
                                                                                                                                      +

                                                                                                                                      The PDF document that this box belongs to.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + PDFDoc + +

                                                                                                                                      +
                                                                                                                                      page_num +
                                                                                                                                      +

                                                                                                                                      The page number of the box.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + Optional[int] + +

                                                                                                                                      +
                                                                                                                                      x0 +
                                                                                                                                      +

                                                                                                                                      The left x-coordinate of the box.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + float + +

                                                                                                                                      +
                                                                                                                                      x1 +
                                                                                                                                      +

                                                                                                                                      The right x-coordinate of the box.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + float + +

                                                                                                                                      +
                                                                                                                                      y0 +
                                                                                                                                      +

                                                                                                                                      The top y-coordinate of the box.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + float + +

                                                                                                                                      +
                                                                                                                                      y1 +
                                                                                                                                      +

                                                                                                                                      The bottom y-coordinate of the box.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + float + +

                                                                                                                                      +
                                                                                                                                      label +
                                                                                                                                      +

                                                                                                                                      The label of the box.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + Optional[str] + +

                                                                                                                                      +
                                                                                                                                      page +
                                                                                                                                      +

                                                                                                                                      The page object that this box belongs to.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + Page + +

                                                                                                                                      +
                                                                                                                                      + + + + + +
                                                                                                                                      + + + + + + + + + + + +
                                                                                                                                      + +
                                                                                                                                      + +
                                                                                                                                      + + + + +

                                                                                                                                      + Text + + +

                                                                                                                                      + + +
                                                                                                                                      +

                                                                                                                                      + Bases: BaseModel

                                                                                                                                      + + +

                                                                                                                                      The Text class represents a text object, not bound to any box.

                                                                                                                                      +

                                                                                                                                      It can be used to store aggregated text from multiple boxes for example.

                                                                                                                                      + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                      ATTRIBUTEDESCRIPTION
                                                                                                                                      text +
                                                                                                                                      +

                                                                                                                                      The text content.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + str + +

                                                                                                                                      +
                                                                                                                                      properties +
                                                                                                                                      +

                                                                                                                                      The style properties of the text.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + List[TextProperties] + +

                                                                                                                                      +
                                                                                                                                      + + + + + +
                                                                                                                                      + + + + + + + + + + + +
                                                                                                                                      + +
                                                                                                                                      + +
                                                                                                                                      + + + + +

                                                                                                                                      + TextBox + + +

                                                                                                                                      + + +
                                                                                                                                      +

                                                                                                                                      + Bases: Box

                                                                                                                                      + + +

                                                                                                                                      The TextBox class represents a text box annotation in a PDF document.

                                                                                                                                      + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                      ATTRIBUTEDESCRIPTION
                                                                                                                                      text +
                                                                                                                                      +

                                                                                                                                      The text content of the text box.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + str + +

                                                                                                                                      +
                                                                                                                                      props +
                                                                                                                                      +

                                                                                                                                      The style properties of the text box.

                                                                                                                                      +
                                                                                                                                      +

                                                                                                                                      + + TYPE: + List[TextProperties] + +

                                                                                                                                      +
                                                                                                                                      + + + + + +
                                                                                                                                      + + + + + + + + + + + +
                                                                                                                                      + +
                                                                                                                                      + +
                                                                                                                                      + + + + +
                                                                                                                                      + +
                                                                                                                                      + +
                                                                                                                                      +

                                                                                                                                        + + + + + + +
                                                                                                                                        +
                                                                                                                                        + + +
                                                                                                                                        + +
                                                                                                                                        + + + +
                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/trainable_pipe/index.html b/main/reference/edspdf/trainable_pipe/index.html new file mode 100644 index 00000000..e6520227 --- /dev/null +++ b/main/reference/edspdf/trainable_pipe/index.html @@ -0,0 +1,3405 @@ + + + + + + + + + + + + + + + + + + + + + + trainable_pipe - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                        + +
                                                                                                                                        + + + + + + + + +
                                                                                                                                        + + +
                                                                                                                                        + +
                                                                                                                                        + + + + + + +
                                                                                                                                        +
                                                                                                                                        + + + +
                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        + + + + +
                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        + + + +
                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        + + + +
                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        + + + +
                                                                                                                                        +
                                                                                                                                        + + + + + + + +

                                                                                                                                        edspdf.trainable_pipe

                                                                                                                                        + + +
                                                                                                                                        + + + + +
                                                                                                                                        + + + +
                                                                                                                                        + + + + + + +
                                                                                                                                        + + + + +

                                                                                                                                        +TrainablePipe + +

                                                                                                                                        + + +
                                                                                                                                        +

                                                                                                                                        + Bases: Module, Generic[OutputBatch]

                                                                                                                                        + + +

                                                                                                                                        A TrainablePipe is a Component that can be trained and inherits from torch.nn.Module. +You can use it either as a torch module inside a more complex neural network, or as +a standalone component in a Pipeline.

                                                                                                                                        +

                                                                                                                                        In addition to the methods of a torch module, a TrainablePipe adds a few methods to +handle preprocessing and collating features, as well as caching intermediate results +for components that share a common subcomponent.

                                                                                                                                        + + + + + +
                                                                                                                                        + + + + + + + + + +
                                                                                                                                        + + + +

                                                                                                                                        +save_extra_data + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Dumps vocabulary indices to JSON files

                                                                                                                                        + + + + + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        path +

                                                                                                                                        Path to the directory where the files will be saved

                                                                                                                                        +

                                                                                                                                        + + TYPE: + Path + +

                                                                                                                                        +
                                                                                                                                        exclude +

                                                                                                                                        The set of component names to exclude from saving. +This is useful when components are repeated in the pipeline.

                                                                                                                                        +

                                                                                                                                        + + TYPE: + set + +

                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +load_extra_data + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Loads vocabulary indices from JSON files

                                                                                                                                        + + + + + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        path +

                                                                                                                                        Path to the directory from which the files will be loaded

                                                                                                                                        +

                                                                                                                                        + + TYPE: + Path + +

                                                                                                                                        +
                                                                                                                                        exclude +

                                                                                                                                        The set of component names to exclude from loading. +This is useful when components are repeated in the pipeline.

                                                                                                                                        +

                                                                                                                                        + + TYPE: + set + +

                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +post_init + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        This method completes the attributes of the component, by looking at some +documents. It is especially useful to build vocabularies or detect the labels +of a classification task.

                                                                                                                                        + + + + + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        gold_data +

                                                                                                                                        The documents to use for initialization.

                                                                                                                                        +

                                                                                                                                        + + TYPE: + Iterable[PDFDoc] + +

                                                                                                                                        +
                                                                                                                                        exclude +

                                                                                                                                        The names of components to exclude from initialization. +This argument will be gradually updated with the names of initialized +components.

                                                                                                                                        +

                                                                                                                                        + + TYPE: + set + +

                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +preprocess + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Preprocess the document to extract features that will be used by the +neural network to perform its predictions.

                                                                                                                                        + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        doc +

                                                                                                                                        PDF document to preprocess

                                                                                                                                        +

                                                                                                                                        + + TYPE: + PDFDoc + +

                                                                                                                                        +
                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                        + + Dict[str, Any] + + +
                                                                                                                                        +

                                                                                                                                        Dictionary (optionally nested) containing the features extracted from +the document.

                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +collate + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Collate the batch of features into a single batch of tensors that can be +used by the forward method of the component.

                                                                                                                                        + + + + + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        batch +

                                                                                                                                        Batch of features

                                                                                                                                        +

                                                                                                                                        + + TYPE: + NestedSequences + +

                                                                                                                                        +
                                                                                                                                        device +

                                                                                                                                        Device on which the tensors should be moved

                                                                                                                                        +

                                                                                                                                        + + TYPE: + device + +

                                                                                                                                        +
                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                        + + InputBatch + + +
                                                                                                                                        +

                                                                                                                                        Dictionary (optionally nested) containing the collated tensors

                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +forward + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Perform the forward pass of the neural network, i.e., apply transformations +over the collated features to compute new embeddings, probabilities, losses, etc.

                                                                                                                                        + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        batch +

                                                                                                                                        Batch of tensors (nested dictionary) computed by the collate method

                                                                                                                                        +

                                                                                                                                        + + TYPE: + InputBatch + +

                                                                                                                                        +
                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                        + + OutputBatch + + +
                                                                                                                                        + +
                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +module_forward + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        This is a wrapper around torch.nn.Module.__call__ to avoid conflict +with the +TrainablePipe.__call__ +method.

                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +make_batch + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Convenience method to preprocess a batch of documents and collate them +Features corresponding to the same path are grouped together in a list, +under the same key.

                                                                                                                                        + + + + + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        docs +

                                                                                                                                        Batch of documents

                                                                                                                                        +

                                                                                                                                        + + TYPE: + Sequence[PDFDoc] + +

                                                                                                                                        +
                                                                                                                                        supervision +

                                                                                                                                        Whether to extract supervision features or not

                                                                                                                                        +

                                                                                                                                        + + TYPE: + bool + + + DEFAULT: + False + +

                                                                                                                                        +
                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                        + + Dict[str, Sequence[Any]] + + +
                                                                                                                                        + +
                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +batch_process + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Process a batch of documents using the neural network. +This differs from the pipe method in that it does not return an +iterator, but executes the component on the whole batch at once.

                                                                                                                                        + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        docs +

                                                                                                                                        Batch of documents

                                                                                                                                        +

                                                                                                                                        + + TYPE: + Sequence[PDFDoc] + +

                                                                                                                                        +
                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                        + + Sequence[PDFDoc] + + +
                                                                                                                                        +

                                                                                                                                        Batch of updated documents

                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +postprocess + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Update the documents with the predictions of the neural network, for instance +converting label probabilities into label attributes on the document lines.

                                                                                                                                        +

                                                                                                                                        By default, this is a no-op.

                                                                                                                                        + + + + + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        docs +

                                                                                                                                        Batch of documents

                                                                                                                                        +

                                                                                                                                        + + TYPE: + Sequence[PDFDoc] + +

                                                                                                                                        +
                                                                                                                                        batch +

                                                                                                                                        Batch of predictions, as returned by the forward method

                                                                                                                                        +

                                                                                                                                        + + TYPE: + OutputBatch + +

                                                                                                                                        +
                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                        + + Sequence[PDFDoc] + + +
                                                                                                                                        + +
                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +preprocess_supervised + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Preprocess the document to extract features that will be used by the +neural network to perform its training. +By default, this returns the same features as the preprocess method.

                                                                                                                                        + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        doc +

                                                                                                                                        PDFDocument to preprocess

                                                                                                                                        +

                                                                                                                                        + + TYPE: + PDFDoc + +

                                                                                                                                        +
                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                        + + Dict[str, Any] + + +
                                                                                                                                        +

                                                                                                                                        Dictionary (optionally nested) containing the features extracted from +the document.

                                                                                                                                        +
                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +

                                                                                                                                        +__call__ + +

                                                                                                                                        + + +
                                                                                                                                        + +

                                                                                                                                        Applies the component to a single doc. +For multiple documents, prefer batch processing via the +batch_process method. +In general, prefer the Pipeline methods.

                                                                                                                                        + + + + + + + + + + + + + + +
                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                        doc + +

                                                                                                                                        + + TYPE: + PDFDoc + +

                                                                                                                                        +
                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                        + + PDFDoc + + +
                                                                                                                                        + +
                                                                                                                                        +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        + + + + +
                                                                                                                                        + +
                                                                                                                                        + +
                                                                                                                                        +

                                                                                                                                          + + + + + + +
                                                                                                                                          +
                                                                                                                                          + + +
                                                                                                                                          + +
                                                                                                                                          + + + +
                                                                                                                                          +
                                                                                                                                          +
                                                                                                                                          +
                                                                                                                                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/utils/alignment/index.html b/main/reference/edspdf/utils/alignment/index.html new file mode 100644 index 00000000..5f4c927c --- /dev/null +++ b/main/reference/edspdf/utils/alignment/index.html @@ -0,0 +1,2526 @@ + + + + + + + + + + + + + + + + + + + + + + alignment - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                          + +
                                                                                                                                          + + + + + + + + +
                                                                                                                                          + + +
                                                                                                                                          + +
                                                                                                                                          + + + + + + +
                                                                                                                                          +
                                                                                                                                          + + + +
                                                                                                                                          +
                                                                                                                                          +
                                                                                                                                          + + + + +
                                                                                                                                          +
                                                                                                                                          +
                                                                                                                                          + + + +
                                                                                                                                          +
                                                                                                                                          +
                                                                                                                                          + + + +
                                                                                                                                          +
                                                                                                                                          +
                                                                                                                                          + + + +
                                                                                                                                          +
                                                                                                                                          + + + + + + + +

                                                                                                                                          edspdf.utils.alignment

                                                                                                                                          + + +
                                                                                                                                          + + + + +
                                                                                                                                          + + + +
                                                                                                                                          + + + + + + + + + +
                                                                                                                                          + + + +

                                                                                                                                          +align_box_labels + +

                                                                                                                                          + + +
                                                                                                                                          + +

                                                                                                                                          Align lines with possibly overlapping (and non-exhaustive) labels.

                                                                                                                                          +

                                                                                                                                          Possible matches are sorted by covered area. Lines with no overlap at all are assigned the pollution_label.

                                                                                                                                          + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                          PARAMETERDESCRIPTION
                                                                                                                                          src_boxes +

                                                                                                                                          The labelled boxes that will be used to determine the label of the dst_boxes

                                                                                                                                          +

                                                                                                                                          + + TYPE: + Sequence[Box] + +

                                                                                                                                          +
                                                                                                                                          dst_boxes +

                                                                                                                                          The non-labelled boxes that will be assigned a label

                                                                                                                                          +

                                                                                                                                          + + TYPE: + Sequence[T] + +

                                                                                                                                          +
                                                                                                                                          threshold +

                                                                                                                                          Threshold to use for discounting a label. Used if the labels DataFrame +does not provide a threshold column, or to fill NaN values thereof.

                                                                                                                                          +

                                                                                                                                          + + TYPE: + float + + + DEFAULT: + 1 + +

                                                                                                                                          +
                                                                                                                                          pollution_label +

                                                                                                                                          The label to use for boxes that are not covered by any of the source boxes

                                                                                                                                          +

                                                                                                                                          + + TYPE: + Any + + + DEFAULT: + None + +

                                                                                                                                          +
                                                                                                                                          + + + + + + + + + + + + + + + + +
                                                                                                                                          RETURNSDESCRIPTION
                                                                                                                                          + + List[Box] + + +
                                                                                                                                          +

                                                                                                                                          A copy of the boxes, with the labels mapped from the source boxes

                                                                                                                                          +
                                                                                                                                          +
                                                                                                                                          + +
                                                                                                                                          + +
                                                                                                                                          + + + +
                                                                                                                                          + +
                                                                                                                                          + +
                                                                                                                                          +

                                                                                                                                            + + + + + + +
                                                                                                                                            +
                                                                                                                                            + + +
                                                                                                                                            + +
                                                                                                                                            + + + +
                                                                                                                                            +
                                                                                                                                            +
                                                                                                                                            +
                                                                                                                                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/utils/collections/index.html b/main/reference/edspdf/utils/collections/index.html new file mode 100644 index 00000000..a97ac661 --- /dev/null +++ b/main/reference/edspdf/utils/collections/index.html @@ -0,0 +1,2563 @@ + + + + + + + + + + + + + + + + + + + + + + collections - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                            + +
                                                                                                                                            + + + + + + + + +
                                                                                                                                            + + +
                                                                                                                                            + +
                                                                                                                                            + + + + + + +
                                                                                                                                            +
                                                                                                                                            + + + +
                                                                                                                                            +
                                                                                                                                            +
                                                                                                                                            + + + + +
                                                                                                                                            +
                                                                                                                                            +
                                                                                                                                            + + + +
                                                                                                                                            +
                                                                                                                                            +
                                                                                                                                            + + + +
                                                                                                                                            +
                                                                                                                                            +
                                                                                                                                            + + + +
                                                                                                                                            +
                                                                                                                                            + + + + + + + +

                                                                                                                                            edspdf.utils.collections

                                                                                                                                            + + +
                                                                                                                                            + + + + +
                                                                                                                                            + + + +
                                                                                                                                            + + + + + + +
                                                                                                                                            + + + + +

                                                                                                                                            +multi_tee + +

                                                                                                                                            + + +
                                                                                                                                            + + +

                                                                                                                                            Makes copies of an iterable such that every iteration over it +starts from the beginning. If the iterable is a sequence (list, tuple), just returns +it, since every iter() over the object restarts from the beginning.

                                                                                                                                            + + + + + +
                                                                                                                                            + + + + + + + + + + + +
                                                                                                                                            + +
                                                                                                                                            + +
                                                                                                                                            + + + + +

                                                                                                                                            +FrozenDict + +

                                                                                                                                            + + +
                                                                                                                                            +

                                                                                                                                            + Bases: dict

                                                                                                                                            + + +

                                                                                                                                            Copied from spacy.util.SimpleFrozenDict to ensure compatibility.

                                                                                                                                            + + +

                                                                                                                                            Initialize the frozen dict. Can be initialized with pre-defined +values.

                                                                                                                                            +

                                                                                                                                            error (str): The error message when user tries to assign to dict.

                                                                                                                                            + + + + +
                                                                                                                                            + + + + + + + + + + + +
                                                                                                                                            + +
                                                                                                                                            + +
                                                                                                                                            + + + + +

                                                                                                                                            +FrozenList + +

                                                                                                                                            + + +
                                                                                                                                            +

                                                                                                                                            + Bases: list

                                                                                                                                            + + +

                                                                                                                                            Copied from spacy.util.SimpleFrozenList to ensure compatibility.

                                                                                                                                            + + +

                                                                                                                                            Initialize the frozen list.

                                                                                                                                            +

                                                                                                                                            error (str): The error message when user tries to mutate the list.

                                                                                                                                            + + + + +
                                                                                                                                            + + + + + + + + + + + +
                                                                                                                                            + +
                                                                                                                                            + +
                                                                                                                                            + + + + +
                                                                                                                                            + +
                                                                                                                                            + +
                                                                                                                                            +

                                                                                                                                              + + + + + + +
                                                                                                                                              +
                                                                                                                                              + + +
                                                                                                                                              + +
                                                                                                                                              + + + +
                                                                                                                                              +
                                                                                                                                              +
                                                                                                                                              +
                                                                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/utils/index.html b/main/reference/edspdf/utils/index.html new file mode 100644 index 00000000..fa74de9f --- /dev/null +++ b/main/reference/edspdf/utils/index.html @@ -0,0 +1,2358 @@ + + + + + + + + + + + + + + + + + + + + + + utils - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                              + +
                                                                                                                                              + + + + + + + + +
                                                                                                                                              + + +
                                                                                                                                              + +
                                                                                                                                              + + + + + + +
                                                                                                                                              +
                                                                                                                                              + + + +
                                                                                                                                              +
                                                                                                                                              +
                                                                                                                                              + + + + +
                                                                                                                                              +
                                                                                                                                              +
                                                                                                                                              + + + +
                                                                                                                                              +
                                                                                                                                              +
                                                                                                                                              + + + +
                                                                                                                                              +
                                                                                                                                              +
                                                                                                                                              + + + +
                                                                                                                                              +
                                                                                                                                              + + + + + + + +

                                                                                                                                              edspdf.utils

                                                                                                                                              + + +
                                                                                                                                              + + + + +
                                                                                                                                              + + + +
                                                                                                                                              + + + + + + + + + + + +
                                                                                                                                              + +
                                                                                                                                              + +
                                                                                                                                              +

                                                                                                                                                + + + + + + +
                                                                                                                                                +
                                                                                                                                                + + +
                                                                                                                                                + +
                                                                                                                                                + + + +
                                                                                                                                                +
                                                                                                                                                +
                                                                                                                                                +
                                                                                                                                                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/utils/optimization/index.html b/main/reference/edspdf/utils/optimization/index.html new file mode 100644 index 00000000..77095743 --- /dev/null +++ b/main/reference/edspdf/utils/optimization/index.html @@ -0,0 +1,2368 @@ + + + + + + + + + + + + + + + + + + + + + + optimization - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                + +
                                                                                                                                                + + + + + + + + +
                                                                                                                                                + + +
                                                                                                                                                + +
                                                                                                                                                + + + + + + +
                                                                                                                                                +
                                                                                                                                                + + + +
                                                                                                                                                +
                                                                                                                                                +
                                                                                                                                                + + + + +
                                                                                                                                                +
                                                                                                                                                +
                                                                                                                                                + + + +
                                                                                                                                                +
                                                                                                                                                +
                                                                                                                                                + + + +
                                                                                                                                                +
                                                                                                                                                +
                                                                                                                                                + + + +
                                                                                                                                                +
                                                                                                                                                + + + + + + + +

                                                                                                                                                edspdf.utils.optimization

                                                                                                                                                + + +
                                                                                                                                                + + + + +
                                                                                                                                                + + + +
                                                                                                                                                + + + + + + + + + + + +
                                                                                                                                                + +
                                                                                                                                                + +
                                                                                                                                                +

                                                                                                                                                  + + + + + + +
                                                                                                                                                  +
                                                                                                                                                  + + +
                                                                                                                                                  + +
                                                                                                                                                  + + + +
                                                                                                                                                  +
                                                                                                                                                  +
                                                                                                                                                  +
                                                                                                                                                  + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/utils/package/index.html b/main/reference/edspdf/utils/package/index.html new file mode 100644 index 00000000..467755f6 --- /dev/null +++ b/main/reference/edspdf/utils/package/index.html @@ -0,0 +1,2492 @@ + + + + + + + + + + + + + + + + + + + + + + package - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                  + +
                                                                                                                                                  + + + + + + + + +
                                                                                                                                                  + + +
                                                                                                                                                  + +
                                                                                                                                                  + + + + + + +
                                                                                                                                                  +
                                                                                                                                                  + + + +
                                                                                                                                                  +
                                                                                                                                                  +
                                                                                                                                                  + + + + +
                                                                                                                                                  +
                                                                                                                                                  +
                                                                                                                                                  + + + +
                                                                                                                                                  +
                                                                                                                                                  +
                                                                                                                                                  + + + +
                                                                                                                                                  +
                                                                                                                                                  +
                                                                                                                                                  + + + +
                                                                                                                                                  +
                                                                                                                                                  + + + + + + + +

                                                                                                                                                  edspdf.utils.package

                                                                                                                                                  + + +
                                                                                                                                                  + + + + +
                                                                                                                                                  + + + +
                                                                                                                                                  + + + + + + +
                                                                                                                                                  + + + + +

                                                                                                                                                  +PoetryPackager + +

                                                                                                                                                  + + +
                                                                                                                                                  + + + + + + +
                                                                                                                                                  + + + + + + + + + +
                                                                                                                                                  + + + +

                                                                                                                                                  +ensure_pyproject + +

                                                                                                                                                  + + +
                                                                                                                                                  + +

                                                                                                                                                  Generates a Poetry based pyproject.toml

                                                                                                                                                  + +
                                                                                                                                                  + +
                                                                                                                                                  + + + +
                                                                                                                                                  + +
                                                                                                                                                  + +
                                                                                                                                                  + + + + +
                                                                                                                                                  + +
                                                                                                                                                  + +
                                                                                                                                                  +

                                                                                                                                                    + + + + + + +
                                                                                                                                                    +
                                                                                                                                                    + + +
                                                                                                                                                    + +
                                                                                                                                                    + + + +
                                                                                                                                                    +
                                                                                                                                                    +
                                                                                                                                                    +
                                                                                                                                                    + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/utils/random/index.html b/main/reference/edspdf/utils/random/index.html new file mode 100644 index 00000000..a58ef30f --- /dev/null +++ b/main/reference/edspdf/utils/random/index.html @@ -0,0 +1,2641 @@ + + + + + + + + + + + + + + + + + + + + + + random - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                    + +
                                                                                                                                                    + + + + + + + + +
                                                                                                                                                    + + +
                                                                                                                                                    + +
                                                                                                                                                    + + + + + + +
                                                                                                                                                    +
                                                                                                                                                    + + + +
                                                                                                                                                    +
                                                                                                                                                    +
                                                                                                                                                    + + + + +
                                                                                                                                                    +
                                                                                                                                                    +
                                                                                                                                                    + + + +
                                                                                                                                                    +
                                                                                                                                                    +
                                                                                                                                                    + + + +
                                                                                                                                                    +
                                                                                                                                                    +
                                                                                                                                                    + + + +
                                                                                                                                                    +
                                                                                                                                                    + + + + + + + +

                                                                                                                                                    edspdf.utils.random

                                                                                                                                                    + + +
                                                                                                                                                    + + + + +
                                                                                                                                                    + + + +
                                                                                                                                                    + + + + + + +
                                                                                                                                                    + + + + +

                                                                                                                                                    +set_seed + +

                                                                                                                                                    + + +
                                                                                                                                                    + + + +

                                                                                                                                                    Set seed values for random generators. +If used as a context, restore the random state +used before entering the context.

                                                                                                                                                    + +

                                                                                                                                                    Parameters

                                                                                                                                                    + + + + + + + + + + + + + + + + + +
                                                                                                                                                    PARAMETERDESCRIPTION
                                                                                                                                                    seed +

                                                                                                                                                    Value used as a seed.

                                                                                                                                                    +

                                                                                                                                                    +

                                                                                                                                                    +
                                                                                                                                                    cuda +

                                                                                                                                                    Saves the cuda random states too

                                                                                                                                                    +

                                                                                                                                                    + + DEFAULT: + is_available() + +

                                                                                                                                                    +
                                                                                                                                                    + + + + +
                                                                                                                                                    + + + + + + + + + + + +
                                                                                                                                                    + +
                                                                                                                                                    + +
                                                                                                                                                    + + +
                                                                                                                                                    + + + +

                                                                                                                                                    +get_random_generator_state + +

                                                                                                                                                    + + +
                                                                                                                                                    + +

                                                                                                                                                    Get the torch, numpy and random random generator state.

                                                                                                                                                    + + + + + + + + + + + + + + +
                                                                                                                                                    PARAMETERDESCRIPTION
                                                                                                                                                    cuda +

                                                                                                                                                    Saves the cuda random states too

                                                                                                                                                    +

                                                                                                                                                    + + DEFAULT: + is_available() + +

                                                                                                                                                    +
                                                                                                                                                    + + + + + + + + + + + + + + + + +
                                                                                                                                                    RETURNSDESCRIPTION
                                                                                                                                                    + + RandomGeneratorState + + +
                                                                                                                                                    + +
                                                                                                                                                    +
                                                                                                                                                    + +
                                                                                                                                                    + +
                                                                                                                                                    + +
                                                                                                                                                    + + + +

                                                                                                                                                    +set_random_generator_state + +

                                                                                                                                                    + + +
                                                                                                                                                    + +

                                                                                                                                                    Set the torch, numpy and random random generator state.

                                                                                                                                                    + + + + + + + + + + + + + + +
                                                                                                                                                    PARAMETERDESCRIPTION
                                                                                                                                                    state + +

                                                                                                                                                    +

                                                                                                                                                    +
                                                                                                                                                    + +
                                                                                                                                                    + +
                                                                                                                                                    + + + +
                                                                                                                                                    + +
                                                                                                                                                    + +
                                                                                                                                                    +

                                                                                                                                                      + + + + + + +
                                                                                                                                                      +
                                                                                                                                                      + + +
                                                                                                                                                      + +
                                                                                                                                                      + + + +
                                                                                                                                                      +
                                                                                                                                                      +
                                                                                                                                                      +
                                                                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/utils/torch/index.html b/main/reference/edspdf/utils/torch/index.html new file mode 100644 index 00000000..ea83b420 --- /dev/null +++ b/main/reference/edspdf/utils/torch/index.html @@ -0,0 +1,2526 @@ + + + + + + + + + + + + + + + + + + + + + + torch - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                      + +
                                                                                                                                                      + + + + + + + + +
                                                                                                                                                      + + +
                                                                                                                                                      + +
                                                                                                                                                      + + + + + + +
                                                                                                                                                      +
                                                                                                                                                      + + + +
                                                                                                                                                      +
                                                                                                                                                      +
                                                                                                                                                      + + + + +
                                                                                                                                                      +
                                                                                                                                                      +
                                                                                                                                                      + + + +
                                                                                                                                                      +
                                                                                                                                                      +
                                                                                                                                                      + + + +
                                                                                                                                                      +
                                                                                                                                                      +
                                                                                                                                                      + + + +
                                                                                                                                                      +
                                                                                                                                                      + + + + + + + +

                                                                                                                                                      edspdf.utils.torch

                                                                                                                                                      + + +
                                                                                                                                                      + + + + +
                                                                                                                                                      + + + +
                                                                                                                                                      + + + + + + + + + +
                                                                                                                                                      + + + +

                                                                                                                                                      +compute_pdf_relative_positions + +

                                                                                                                                                      + + +
                                                                                                                                                      + +

                                                                                                                                                      Compute relative positions between boxes. +Input boxes must be split between pages with the shape n_pages * n_boxes

                                                                                                                                                      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                      PARAMETERDESCRIPTION
                                                                                                                                                      x0 + +

                                                                                                                                                      +

                                                                                                                                                      +
                                                                                                                                                      y0 + +

                                                                                                                                                      +

                                                                                                                                                      +
                                                                                                                                                      x1 + +

                                                                                                                                                      +

                                                                                                                                                      +
                                                                                                                                                      y1 + +

                                                                                                                                                      +

                                                                                                                                                      +
                                                                                                                                                      width + +

                                                                                                                                                      +

                                                                                                                                                      +
                                                                                                                                                      height + +

                                                                                                                                                      +

                                                                                                                                                      +
                                                                                                                                                      n_relative_positions +

                                                                                                                                                      Maximum range of embeddable relative positions between boxes (further +distances will be capped to ±n_relative_positions // 2)

                                                                                                                                                      +

                                                                                                                                                      +

                                                                                                                                                      +
                                                                                                                                                      + + + + + + + + + + + + + + + + +
                                                                                                                                                      RETURNSDESCRIPTION
                                                                                                                                                      + + LongTensor + + +
                                                                                                                                                      +

                                                                                                                                                      Shape: n_pages * n_boxes * n_boxes * 2

                                                                                                                                                      +
                                                                                                                                                      +
                                                                                                                                                      + +
                                                                                                                                                      + +
                                                                                                                                                      + + + +
                                                                                                                                                      + +
                                                                                                                                                      + +
                                                                                                                                                      +

                                                                                                                                                        + + + + + + +
                                                                                                                                                        +
                                                                                                                                                        + + +
                                                                                                                                                        + +
                                                                                                                                                        + + + +
                                                                                                                                                        +
                                                                                                                                                        +
                                                                                                                                                        +
                                                                                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/visualization/annotations/index.html b/main/reference/edspdf/visualization/annotations/index.html new file mode 100644 index 00000000..cf963b5c --- /dev/null +++ b/main/reference/edspdf/visualization/annotations/index.html @@ -0,0 +1,2636 @@ + + + + + + + + + + + + + + + + + + + + + + annotations - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                        + +
                                                                                                                                                        + + + + + + + + +
                                                                                                                                                        + + +
                                                                                                                                                        + +
                                                                                                                                                        + + + + + + +
                                                                                                                                                        +
                                                                                                                                                        + + + +
                                                                                                                                                        +
                                                                                                                                                        +
                                                                                                                                                        + + + + +
                                                                                                                                                        +
                                                                                                                                                        +
                                                                                                                                                        + + + +
                                                                                                                                                        +
                                                                                                                                                        +
                                                                                                                                                        + + + +
                                                                                                                                                        +
                                                                                                                                                        +
                                                                                                                                                        + + + +
                                                                                                                                                        +
                                                                                                                                                        + + + + + + + +

                                                                                                                                                        edspdf.visualization.annotations

                                                                                                                                                        + + +
                                                                                                                                                        + + + + +
                                                                                                                                                        + + + +
                                                                                                                                                        + + + + + + + + + +
                                                                                                                                                        + + + +

                                                                                                                                                        +show_annotations + +

                                                                                                                                                        + + +
                                                                                                                                                        + +

                                                                                                                                                        Show Box annotations on a PDF document.

                                                                                                                                                        + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                                        pdf +

                                                                                                                                                        Bytes content of the PDF document

                                                                                                                                                        +

                                                                                                                                                        + + TYPE: + bytes + +

                                                                                                                                                        +
                                                                                                                                                        annotations +

                                                                                                                                                        List of Box annotations to show

                                                                                                                                                        +

                                                                                                                                                        + + TYPE: + Sequence[Box] + +

                                                                                                                                                        +
                                                                                                                                                        colors +

                                                                                                                                                        Colors to use for each label. If a list is provided, it will be used to color +the first len(colors) unique labels. If a dictionary is provided, it will be +used to color the labels in the dictionary. If None, a default color scheme will +be used.

                                                                                                                                                        +

                                                                                                                                                        + + TYPE: + Optional[Union[Dict[str, str], List[str]]] + + + DEFAULT: + None + +

                                                                                                                                                        +
                                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                                        + + List[PpmImageFile] + + +
                                                                                                                                                        +

                                                                                                                                                        List of PIL images with the annotations. You can display them in a notebook +with display(*pages).

                                                                                                                                                        +
                                                                                                                                                        +
                                                                                                                                                        + +
                                                                                                                                                        + +
                                                                                                                                                        + +
                                                                                                                                                        + + + +

                                                                                                                                                        +compare_results + +

                                                                                                                                                        + + +
                                                                                                                                                        + +

                                                                                                                                                        Compare two sets of annotations on a PDF document.

                                                                                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                        PARAMETERDESCRIPTION
                                                                                                                                                        pdf +

                                                                                                                                                        Bytes content of the PDF document

                                                                                                                                                        +

                                                                                                                                                        + + TYPE: + bytes + +

                                                                                                                                                        +
                                                                                                                                                        pred +

                                                                                                                                                        List of Box annotations to show on the left side

                                                                                                                                                        +

                                                                                                                                                        + + TYPE: + Sequence[Box] + +

                                                                                                                                                        +
                                                                                                                                                        gold +

                                                                                                                                                        List of Box annotations to show on the right side

                                                                                                                                                        +

                                                                                                                                                        + + TYPE: + Sequence[Box] + +

                                                                                                                                                        +
                                                                                                                                                        colors +

                                                                                                                                                        Colors to use for each label. If a list is provided, it will be used to color +the first len(colors) unique labels. If a dictionary is provided, it will be +used to color the labels in the dictionary. If None, a default color scheme will +be used.

                                                                                                                                                        +

                                                                                                                                                        + + TYPE: + Optional[Union[Dict[str, str], List[str]]] + + + DEFAULT: + None + +

                                                                                                                                                        +
                                                                                                                                                        + + + + + + + + + + + + + + + + +
                                                                                                                                                        RETURNSDESCRIPTION
                                                                                                                                                        + + List[PpmImageFile] + + +
                                                                                                                                                        +

                                                                                                                                                        List of PIL images with the annotations. You can display them in a notebook +with display(*pages).

                                                                                                                                                        +
                                                                                                                                                        +
                                                                                                                                                        + +
                                                                                                                                                        + +
                                                                                                                                                        + + + +
                                                                                                                                                        + +
                                                                                                                                                        + +
                                                                                                                                                        +

                                                                                                                                                          + + + + + + +
                                                                                                                                                          +
                                                                                                                                                          + + +
                                                                                                                                                          + +
                                                                                                                                                          + + + +
                                                                                                                                                          +
                                                                                                                                                          +
                                                                                                                                                          +
                                                                                                                                                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/visualization/index.html b/main/reference/edspdf/visualization/index.html new file mode 100644 index 00000000..5a25b8d4 --- /dev/null +++ b/main/reference/edspdf/visualization/index.html @@ -0,0 +1,2358 @@ + + + + + + + + + + + + + + + + + + + + + + visualization - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                          + +
                                                                                                                                                          + + + + + + + + +
                                                                                                                                                          + + +
                                                                                                                                                          + +
                                                                                                                                                          + + + + + + +
                                                                                                                                                          +
                                                                                                                                                          + + + +
                                                                                                                                                          +
                                                                                                                                                          +
                                                                                                                                                          + + + + +
                                                                                                                                                          +
                                                                                                                                                          +
                                                                                                                                                          + + + +
                                                                                                                                                          +
                                                                                                                                                          +
                                                                                                                                                          + + + +
                                                                                                                                                          +
                                                                                                                                                          +
                                                                                                                                                          + + + +
                                                                                                                                                          +
                                                                                                                                                          + + + + + + + +

                                                                                                                                                          edspdf.visualization

                                                                                                                                                          + + +
                                                                                                                                                          + + + + +
                                                                                                                                                          + + + +
                                                                                                                                                          + + + + + + + + + + + +
                                                                                                                                                          + +
                                                                                                                                                          + +
                                                                                                                                                          +

                                                                                                                                                            + + + + + + +
                                                                                                                                                            +
                                                                                                                                                            + + +
                                                                                                                                                            + +
                                                                                                                                                            + + + +
                                                                                                                                                            +
                                                                                                                                                            +
                                                                                                                                                            +
                                                                                                                                                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/reference/edspdf/visualization/merge/index.html b/main/reference/edspdf/visualization/merge/index.html new file mode 100644 index 00000000..ff5a0ad9 --- /dev/null +++ b/main/reference/edspdf/visualization/merge/index.html @@ -0,0 +1,2481 @@ + + + + + + + + + + + + + + + + + + + + + + merge - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                            + +
                                                                                                                                                            + + + + + + + + +
                                                                                                                                                            + + +
                                                                                                                                                            + +
                                                                                                                                                            + + + + + + +
                                                                                                                                                            +
                                                                                                                                                            + + + +
                                                                                                                                                            +
                                                                                                                                                            +
                                                                                                                                                            + + + + +
                                                                                                                                                            +
                                                                                                                                                            +
                                                                                                                                                            + + + +
                                                                                                                                                            +
                                                                                                                                                            +
                                                                                                                                                            + + + +
                                                                                                                                                            +
                                                                                                                                                            +
                                                                                                                                                            + + + +
                                                                                                                                                            +
                                                                                                                                                            + + + + + + + +

                                                                                                                                                            edspdf.visualization.merge

                                                                                                                                                            + + +
                                                                                                                                                            + + + + +
                                                                                                                                                            + + + +
                                                                                                                                                            + + + + + + + + + +
                                                                                                                                                            + + + +

                                                                                                                                                            +merge_boxes + +

                                                                                                                                                            + + +
                                                                                                                                                            + +

                                                                                                                                                            Recursively merge boxes that have the same label to form larger non-overlapping +boxes.

                                                                                                                                                            + + + + + + + + + + + + + + +
                                                                                                                                                            PARAMETERDESCRIPTION
                                                                                                                                                            boxes +

                                                                                                                                                            List of boxes to merge

                                                                                                                                                            +

                                                                                                                                                            + + TYPE: + Sequence[Box] + +

                                                                                                                                                            +
                                                                                                                                                            + + + + + + + + + + + + + + + + +
                                                                                                                                                            RETURNSDESCRIPTION
                                                                                                                                                            + + List[Box] + + +
                                                                                                                                                            +

                                                                                                                                                            List of merged boxes

                                                                                                                                                            +
                                                                                                                                                            +
                                                                                                                                                            + +
                                                                                                                                                            + +
                                                                                                                                                            + + + +
                                                                                                                                                            + +
                                                                                                                                                            + +
                                                                                                                                                            +

                                                                                                                                                              + + + + + + +
                                                                                                                                                              +
                                                                                                                                                              + + +
                                                                                                                                                              + +
                                                                                                                                                              + + + +
                                                                                                                                                              +
                                                                                                                                                              +
                                                                                                                                                              +
                                                                                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/references.bib b/main/references.bib new file mode 100644 index 00000000..5c20fd1b --- /dev/null +++ b/main/references.bib @@ -0,0 +1,7 @@ +@article{vaswani2017attention, + title={Attention is all you need}, + author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, + journal={Advances in neural information processing systems}, + volume={30}, + year={2017} +} diff --git a/main/roadmap/index.html b/main/roadmap/index.html new file mode 100644 index 00000000..1d2544a8 --- /dev/null +++ b/main/roadmap/index.html @@ -0,0 +1,2347 @@ + + + + + + + + + + + + + + + + + + + + Roadmap - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                              + +
                                                                                                                                                              + + + + + + + + +
                                                                                                                                                              + + +
                                                                                                                                                              + +
                                                                                                                                                              + + + + + + +
                                                                                                                                                              +
                                                                                                                                                              + + + +
                                                                                                                                                              +
                                                                                                                                                              +
                                                                                                                                                              + + + + +
                                                                                                                                                              +
                                                                                                                                                              +
                                                                                                                                                              + + + +
                                                                                                                                                              +
                                                                                                                                                              +
                                                                                                                                                              + + + +
                                                                                                                                                              +
                                                                                                                                                              +
                                                                                                                                                              + + + +
                                                                                                                                                              +
                                                                                                                                                              + + + + + + + +

                                                                                                                                                              Roadmap

                                                                                                                                                              +
                                                                                                                                                                +
                                                                                                                                                              • Style extraction
                                                                                                                                                              • +
                                                                                                                                                              • Custom hybrid torch-based pipeline & configuration system
                                                                                                                                                              • +
                                                                                                                                                              • Drop pandas DataFrame in favour of a ~~Cython~~ attr wrapper around PDF documents?
                                                                                                                                                              • +
                                                                                                                                                              • Add training capabilities with a CLI to automate the annotation/preparation/training loop. + Again, draw inspiration from spaCy, and maybe add the notion of a TrainableClassifier...
                                                                                                                                                              • +
                                                                                                                                                              • Add complete serialisation capabilities, to save a full pipeline to disk. + Draw inspiration from spaCy, which took great care to solve these issues: + add save and load methods to every pipeline component
                                                                                                                                                              • +
                                                                                                                                                              • Multiple-column extraction
                                                                                                                                                              • +
                                                                                                                                                              • Table detector
                                                                                                                                                              • +
                                                                                                                                                              • Integrate third-party OCR module
                                                                                                                                                              • +
                                                                                                                                                              +

                                                                                                                                                                + + + + + + +
                                                                                                                                                                +
                                                                                                                                                                + + +
                                                                                                                                                                + +
                                                                                                                                                                + + + +
                                                                                                                                                                +
                                                                                                                                                                +
                                                                                                                                                                +
                                                                                                                                                                + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/scripts/__pycache__/bibtex.cpython-311.pyc b/main/scripts/__pycache__/bibtex.cpython-311.pyc new file mode 100644 index 00000000..0efa6c9a Binary files /dev/null and b/main/scripts/__pycache__/bibtex.cpython-311.pyc differ diff --git a/main/scripts/__pycache__/plugin.cpython-311.pyc b/main/scripts/__pycache__/plugin.cpython-311.pyc new file mode 100644 index 00000000..e993b343 Binary files /dev/null and b/main/scripts/__pycache__/plugin.cpython-311.pyc differ diff --git a/main/scripts/bibtex.py b/main/scripts/bibtex.py new file mode 100644 index 00000000..c3f9a36b --- /dev/null +++ b/main/scripts/bibtex.py @@ -0,0 +1,287 @@ +# Based on https://github.com/darwindarak/mdx_bib +import re +import string +from collections import Counter, OrderedDict +from typing import Tuple +from xml.etree import ElementTree as etree +from xml.etree.ElementTree import tostring as etree_to_string + +from markdown.extensions import Extension +from markdown.inlinepatterns import Pattern +from markdown.preprocessors import Preprocessor +from mkdocs.config.config_options import Type as MkType +from mkdocs.plugins import BasePlugin +from pybtex.database.input import bibtex +from pybtex.exceptions import PybtexError + +BRACKET_RE = re.compile(r"\[([^\[]+)\]") +CITE_RE = re.compile(r"@(\w+)") +DEF_RE = re.compile(r"\A {0,3}\[@(\w+)\]:\s*(.*)") +INDENT_RE = re.compile(r"\A\t| {4}(.*)") + +CITATION_RE = r"(\[@(?:\w+)(?: *, *@(?:\w+))*\])" + + +class Bibliography(object): + """Keep track of document references and citations for exporting""" + + def __init__(self, extension, plugin, bibtex_file, order): + self.extension = extension + self.order = order + self.plugin = plugin + + self.citations = OrderedDict() + self.references = dict() + + if 
bibtex_file: + try: + parser = bibtex.Parser() + self.bibsource = parser.parse_file(bibtex_file).entries + self.labels = { + id: self.formatCitation(self.bibsource[id]) + for id in self.bibsource.keys() + } + for value, occurrences in Counter(self.labels.values()).items(): + if occurrences > 1: + for xkey, xvalue in self.labels.items(): + i = 0 + if xvalue == value: + self.labels[ + xkey + ] = f"{xvalue}{string.ascii_lowercase[i]}" + i += 1 + + except PybtexError: + print("Error loading bibtex file") + self.bibsource = dict() + self.labels = {} + else: + self.bibsource = dict() + + def addCitation(self, citekey): + self.citations[citekey] = self.citations.get(citekey, 0) + 1 + + def setReference(self, citekey, reference): + self.references[citekey] = reference + + def citationID(self, citekey): + return "cite-" + citekey + + def referenceID(self, citekey): + return "ref-" + citekey + + def formatAuthor(self, author): + out = f"{author.last_names[0]} {author.first_names[0][0]}." + if author.middle_names: + out += f"{author.middle_names[0][0]}." + return out.replace("{", "").replace("}", "") + + def formatAuthorSurname(self, author): + out = author.last_names[0] + return out.replace("{", "").replace("}", "") + + def formatReference(self, ref): + author_list = list(map(self.formatAuthor, ref.persons["author"])) + + if len(author_list) == 1: + authors = author_list[0] + else: + authors = ", ".join(author_list[:-1]) + authors += f" and {author_list[-1]}" + + # Harvard style + # Surname, Initial, ... and Last_Surname, + # Initial, Year. Title. Journal, Volume(Issue), pages. doi. + + title = ref.fields["title"].replace("{", "").replace("}", "") + journal = ref.fields.get("journal", "") + volume = ref.fields.get("volume", "") + issue = ref.fields.get("issue", "") + year = ref.fields.get("year") + pages = ref.fields.get("pages") + doi = ref.fields.get("doi") + + ref_id = self.referenceID(ref.key) + reference = f"

                                                                                                                                                                {authors}, {year}. {title}." + if journal: + reference += f" {journal}." + if volume: + reference += f" {volume}" + if issue: + reference += f"({issue})" + if pages: + reference += f", pp.{pages}" + reference += "." + if doi: + reference += ( + f' {doi}' + ) + reference += "

                                                                                                                                                                " + + return etree.fromstring(reference) + + def formatCitation(self, ref): + author_list = list(map(self.formatAuthorSurname, ref.persons["author"])) + year = ref.fields.get("year") + + if len(author_list) == 1: + citation = f"{author_list[0]}" + elif len(author_list) == 2: + citation = f"{author_list[0]} and {author_list[1]}" + else: + citation = f"{author_list[0]} et al." + + citation += f", {year}" + + return citation + + def make_bibliography(self): + if self.order == "alphabetical": + raise (NotImplementedError) + + div = etree.Element("div") + div.set("class", "footnote") + div.append(etree.Element("hr")) + ol = etree.SubElement(div, "ol") + + if not self.citations: + return div + + # table = etree.SubElement(div, "table") + # table.set("class", "references") + # tbody = etree.SubElement(table, "tbody") + etree.SubElement(div, "div") + for id in self.citations: + li = etree.SubElement(ol, "li") + li.set("id", self.referenceID(id)) + # ref_id = etree.SubElement(li, "td") + ref_txt = etree.SubElement(li, "p") + if id in self.references: + self.extension.parser.parseChunk(ref_txt, self.references[id]) + elif id in self.bibsource: + ref_txt.append(self.formatReference(self.bibsource[id])) + else: + ref_txt.text = "Missing citation" + + return div + + def clear_citations(self): + self.citations = OrderedDict() + + +class CitationsPreprocessor(Preprocessor): + """Gather reference definitions and citation keys""" + + def __init__(self, bibliography): + self.bib = bibliography + + def subsequentIndents(self, lines, i): + """Concatenate consecutive indented lines""" + linesOut = [] + while i < len(lines): + m = INDENT_RE.match(lines[i]) + if m: + linesOut.append(m.group(1)) + i += 1 + else: + break + return " ".join(linesOut), i + + def run(self, lines): + linesOut = [] + i = 0 + + while i < len(lines): + # 
Check to see if the line starts a reference definition + m = DEF_RE.match(lines[i]) + if m: + key = m.group(1) + reference = m.group(2) + indents, i = self.subsequentIndents(lines, i + 1) + reference += " " + indents + + self.bib.setReference(key, reference) + continue + + # Look for all @citekey patterns inside hard brackets + for bracket in BRACKET_RE.findall(lines[i]): + for c in CITE_RE.findall(bracket): + self.bib.addCitation(c) + linesOut.append(lines[i]) + i += 1 + + return linesOut + + +class CitationsPattern(Pattern): + """Handles converting citations keys into links""" + + def __init__(self, pattern, bibliography): + super(CitationsPattern, self).__init__(pattern) + self.bib = bibliography + + def handleMatch(self, m): + span = etree.Element("span") + for cite_match in CITE_RE.finditer(m.group(2)): + id = cite_match.group(1) + if id in self.bib.bibsource: + a = etree.Element("a") + a.set("id", self.bib.citationID(id)) + a.set("href", "./#" + self.bib.referenceID(id)) + a.set("class", "citation") + a.text = self.bib.labels[id] + span.append(a) + else: + continue + if len(span) == 0: + return None + return span + + +context_citations = None + + +class CitationsExtension(Extension): + def __init__(self): + super(CitationsExtension, self).__init__() + self.bib = None + + def extendMarkdown(self, md): + md.registerExtension(self) + self.parser = md.parser + self.md = md + + md.preprocessors.register(CitationsPreprocessor(self.bib), "mdx_bib", 15) + md.inlinePatterns.register( + CitationsPattern(CITATION_RE, self.bib), "mdx_bib", 175 + ) + + +def makeExtension(*args, **kwargs): + return CitationsExtension(*args, **kwargs) + + +class BibTexPlugin(BasePlugin): + config_scheme: Tuple[Tuple[str, MkType]] = ( + ("bibtex_file", MkType(str)), # type: ignore[assignment] + ("order", MkType(str, default="unsorted")), # type: ignore[assignment] + ) + + def __init__(self): + self.citations = None + + def on_config(self, config, **kwargs): + extension = CitationsExtension() 
+ self.bib = Bibliography( + extension, + self, + self.config["bibtex_file"], + self.config["order"], + ) + extension.bib = self.bib + config["markdown_extensions"].append(extension) + + def on_page_content(self, html, page, config, files): + html += "\n" + etree_to_string(self.bib.make_bibliography()).decode() + self.bib.clear_citations() + return html diff --git a/main/scripts/plugin.py b/main/scripts/plugin.py new file mode 100644 index 00000000..4c5afb7d --- /dev/null +++ b/main/scripts/plugin.py @@ -0,0 +1,92 @@ +import os +import shutil +from pathlib import Path + +import mkdocs + +# Add the files from the project root + +# Generate the code reference pages and navigation. +doc_reference = Path("docs/reference") +shutil.rmtree(doc_reference, ignore_errors=True) +os.makedirs(doc_reference, exist_ok=True) +root = Path("edspdf") +for path in sorted(root.rglob("*.py")): + if "poppler_src" in str(path): + continue + module_path = path.relative_to(root.parent).with_suffix("") + doc_path = path.relative_to(root.parent).with_suffix(".md") + full_doc_path = doc_reference / doc_path + parts = list(module_path.parts) + if parts[-1] == "__init__": + parts = parts[:-1] + doc_path = doc_path.with_name("index.md") + full_doc_path = full_doc_path.with_name("index.md") + elif parts[-1] == "__main__": + continue + ident = ".".join(parts) + os.makedirs(full_doc_path.parent, exist_ok=True) + with open(full_doc_path, "w") as fd: + print(f"# `{ident}`\n", file=fd) + print("::: " + ident, file=fd) + if root != "edspdf": + print(" options:", file=fd) + print(" show_source: false", file=fd) + + +def on_files(files: mkdocs.structure.files.Files, config: mkdocs.config.Config): + """ + Recursively the navigation of the mkdocs config + and recursively content of directories of page that point + to directories. 
+ + Parameters + ---------- + config: mkdocs.config.Config + The configuration object + kwargs: dict + Additional arguments + """ + + def get_nested_files(path): + files = [] + for file in path.iterdir(): + if file.is_dir(): + index = file / "index.md" + if index.exists(): + # Get name from h1 heading in index + name = index.read_text().split("\n")[0].strip("# ") + if name.startswith("`edspdf"): + name = name[1:-1].split(".")[-1] + files.append({name: get_nested_files(file)}) + else: + title = file.name.replace("_", " ").replace("-", " ").title() + files.append({title: get_nested_files(file)}) + else: + name = file.read_text().split("\n")[0].strip("# ") + if name.startswith("`edspdf"): + name = name[1:-1].split(".")[-1] + files.append({name: str(file.relative_to(config["docs_dir"]))}) + else: + files.append(str(file.relative_to(config["docs_dir"]))) + return files + + def rec(tree): + if isinstance(tree, list): + return [rec(item) for item in tree] + elif isinstance(tree, dict): + return {k: rec(item) for k, item in tree.items()} + elif isinstance(tree, str): + if tree.endswith("/"): + # We have a directory + path = Path(config["docs_dir"]) / tree + if path.is_dir(): + return get_nested_files(path) + else: + return tree + else: + return tree + else: + return tree + + config["nav"] = rec(config["nav"]) diff --git a/main/search/search_index.json b/main/search/search_index.json new file mode 100644 index 00000000..3bb1dbad --- /dev/null +++ b/main/search/search_index.json @@ -0,0 +1 @@ +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Overview","text":"

                                                                                                                                                                EDS-PDF provides a modular framework to extract text information from PDF documents.

                                                                                                                                                                You can use it out-of-the-box, or extend it to fit your use-case.

                                                                                                                                                                "},{"location":"#getting-started","title":"Getting started","text":""},{"location":"#installation","title":"Installation","text":"

                                                                                                                                                                Install the library with pip:

                                                                                                                                                                $ pip install edspdf\n---> 100%\ncolor:green Installation successful\n
                                                                                                                                                                "},{"location":"#extracting-text","title":"Extracting text","text":"

                                                                                                                                                                Let's build a simple PDF extractor that uses a rule-based classifier. There are two ways to do this, either by using the configuration system or by using the pipeline API.

                                                                                                                                                                Configuration based pipeline | API based pipeline

                                                                                                                                                                Create a configuration file:

                                                                                                                                                                config.cfg
                                                                                                                                                                [pipeline]\npipeline = [\"extractor\", \"classifier\", \"aggregator\"]\n\n[components.extractor]\n@factory = \"pdfminer-extractor\"\n\n[components.classifier]\n@factory = \"mask-classifier\"\nx0 = 0.2\nx1 = 0.9\ny0 = 0.3\ny1 = 0.6\nthreshold = 0.1\n\n[components.aggregator]\n@factory = \"simple-aggregator\"\n

                                                                                                                                                                and load it from Python:

                                                                                                                                                                import edspdf\nfrom pathlib import Path\n\nmodel = edspdf.load(\"config.cfg\")  # (1)\n

                                                                                                                                                                Or create a pipeline directly from Python:

                                                                                                                                                                from edspdf import Pipeline\n\nmodel = Pipeline()\nmodel.add_pipe(\"pdfminer-extractor\")\nmodel.add_pipe(\n    \"mask-classifier\",\n    config=dict(\n        x0=0.2,\n        x1=0.9,\n        y0=0.3,\n        y1=0.6,\n        threshold=0.1,\n    ),\n)\nmodel.add_pipe(\"simple-aggregator\")\n

                                                                                                                                                                This pipeline can then be applied (for instance with this PDF):

                                                                                                                                                                # Get a PDF\npdf = Path(\"/Users/perceval/Development/edspdf/tests/resources/letter.pdf\").read_bytes()\npdf = model(pdf)\n\nbody = pdf.aggregated_texts[\"body\"]\n\ntext, style = body.text, body.properties\n

                                                                                                                                                                See the rule-based recipe for a step-by-step explanation of what is happening.

                                                                                                                                                                "},{"location":"#citation","title":"Citation","text":"

                                                                                                                                                                If you use EDS-PDF, please cite us as below.

                                                                                                                                                                @software{edspdf,\nauthor  = {Dura, Basile and Wajsburt, Perceval and Calliger, Alice and G\u00e9rardin, Christel and Bey, Romain},\ndoi     = {10.5281/zenodo.6902977},\nlicense = {BSD-3-Clause},\ntitle   = {{EDS-PDF: Smart text extraction from PDF documents}},\nurl     = {https://github.com/aphp/edspdf}\n}\n
                                                                                                                                                                "},{"location":"#acknowledgement","title":"Acknowledgement","text":"

                                                                                                                                                                We would like to thank Assistance Publique \u2013 H\u00f4pitaux de Paris and AP-HP Foundation for funding this project.

                                                                                                                                                                "},{"location":"alternatives/","title":"Alternatives & Comparison","text":"

                                                                                                                                                                EDS-PDF was developed to propose a more modular and extendable approach to PDF extraction than PDFBox, the legacy implementation at APHP's clinical data warehouse.

                                                                                                                                                                EDS-PDF takes inspiration from Explosion's spaCy pipelining system and closely follows its API. Therefore, the core object within EDS-PDF is the Pipeline, which organises the processing of PDF documents into multiple components. However, unlike spaCy, the library is built around a single deep learning framework, pytorch, which makes model development easier.

                                                                                                                                                                "},{"location":"changelog/","title":"Changelog","text":""},{"location":"changelog/#v080","title":"v0.8.0","text":""},{"location":"changelog/#added","title":"Added","text":"
                                                                                                                                                                • Add multi-modal transformers (huggingface-embedding) with windowing options
                                                                                                                                                                • Add render_page option to pdfminer extractor, for multi-modal PDF features
                                                                                                                                                                • Add inference utilities (accelerators), with simple mono process support and multi gpu / cpu support
                                                                                                                                                                • Packaging utils (pipeline.package(...)) to make a pip installable package from a pipeline
                                                                                                                                                                "},{"location":"changelog/#changed","title":"Changed","text":"
                                                                                                                                                                • Updated API to follow EDS-NLP's refactoring
                                                                                                                                                                • Updated confit to 0.4.2 (better errors) and foldedtensor to 0.3.0 (better multiprocess support)
                                                                                                                                                                • Removed pipeline.score. You should use pipeline.pipe, a custom scorer and pipeline.select_pipes instead.
                                                                                                                                                                • Better test coverage
                                                                                                                                                                • Use hatch instead of setuptools to build the package / docs and run the tests
                                                                                                                                                                "},{"location":"changelog/#fixed","title":"Fixed","text":"
                                                                                                                                                                • Fixed attrs dependency only being installed in dev mode
                                                                                                                                                                "},{"location":"changelog/#v070","title":"v0.7.0","text":"

                                                                                                                                                                Major refactoring of the library:

                                                                                                                                                                "},{"location":"changelog/#core-features","title":"Core features","text":"
                                                                                                                                                                • new pipeline system whose API is inspired by spaCy
                                                                                                                                                                • first-class support for pytorch
                                                                                                                                                                • hybrid model inference and training (rules + deep learning)
                                                                                                                                                                • moved from pandas DataFrame to attrs dataclasses (PDFDoc, Page, Box, ...) for representing PDF documents
                                                                                                                                                                • new configuration system based on [config][https://github.com/aphp/config], with support for instantiation of complex deep learning models, off-the-shelf CLI, ...
                                                                                                                                                                "},{"location":"changelog/#functional-features","title":"Functional features","text":"
                                                                                                                                                                • new extractors: pymupdf and poppler (separate packages for licensing reasons)
                                                                                                                                                                • many deep learning layers (box-transformer, 2d attention with relative position information, ...)
                                                                                                                                                                • trainable deep learning classifier
                                                                                                                                                                • training recipes for deep learning models
                                                                                                                                                                "},{"location":"changelog/#v063-2023-01-23","title":"v0.6.3 - 2023-01-23","text":""},{"location":"changelog/#fixed_1","title":"Fixed","text":"
                                                                                                                                                              • Allow corrupted PDFs to not raise an error by default (they are treated as empty PDFs)
                                                                                                                                                                • Fix classification and aggregation for empty PDFs
                                                                                                                                                                "},{"location":"changelog/#v062-2022-12-07","title":"v0.6.2 - 2022-12-07","text":"

                                                                                                                                                                Cast bytes-like extractor inputs as bytes

                                                                                                                                                                "},{"location":"changelog/#v061-2022-12-07","title":"v0.6.1 - 2022-12-07","text":"

                                                                                                                                                                Performance and cuda related fixes.

                                                                                                                                                                "},{"location":"changelog/#v060-2022-12-05","title":"v0.6.0 - 2022-12-05","text":"

                                                                                                                                                                Many, many changes: added torch as the main deep learning framework instead of spaCy and thinc; added poppler and mupdf as alternatives to pdfminer; new pipeline / config / registry system to facilitate consistency between training and inference; standardization of the exchange format between components with dataclass models (attrs more specifically) instead of pandas dataframes.

                                                                                                                                                                "},{"location":"changelog/#v053-2022-08-31","title":"v0.5.3 - 2022-08-31","text":""},{"location":"changelog/#added_1","title":"Added","text":"
                                                                                                                                                                • Add label mapping parameter to aggregators (to merge different types of blocks such as title and body)
                                                                                                                                                                • Improved line aggregation formula
                                                                                                                                                                "},{"location":"changelog/#v052-2022-08-30","title":"v0.5.2 - 2022-08-30","text":""},{"location":"changelog/#fixed_2","title":"Fixed","text":"
                                                                                                                                                                • Fix aggregation for empty documents
                                                                                                                                                                "},{"location":"changelog/#v051-2022-07-26","title":"v0.5.1 - 2022-07-26","text":""},{"location":"changelog/#changed_1","title":"Changed","text":"
                                                                                                                                                                • Drop the pdf2image dependency, replacing it with pypdfium2 (easier installation)
                                                                                                                                                                "},{"location":"changelog/#v050-2022-07-25","title":"v0.5.0 - 2022-07-25","text":""},{"location":"changelog/#changed_2","title":"Changed","text":"
                                                                                                                                                                • Major refactoring of the library. Moved from concepts (aggregation) to plural names (aggregators).
                                                                                                                                                                "},{"location":"changelog/#v043-2022-07-20","title":"v0.4.3 - 2022-07-20","text":""},{"location":"changelog/#fixed_3","title":"Fixed","text":"
                                                                                                                                                                • Multi page boxes alignment
                                                                                                                                                                "},{"location":"changelog/#v042-2022-07-06","title":"v0.4.2 - 2022-07-06","text":""},{"location":"changelog/#added_2","title":"Added","text":"
                                                                                                                                                                • package-resource.v1 in the misc registry
                                                                                                                                                                "},{"location":"changelog/#v041-2022-06-14","title":"v0.4.1 - 2022-06-14","text":""},{"location":"changelog/#fixed_4","title":"Fixed","text":"
                                                                                                                                                                • Remove importlib.metadata dependency, which led to issues with Python 3.7
                                                                                                                                                                "},{"location":"changelog/#v040-2022-06-14","title":"v0.4.0 - 2022-06-14","text":""},{"location":"changelog/#added_3","title":"Added","text":"
                                                                                                                                                                • Python 3.7 support, by relaxing dependency constraints
                                                                                                                                                                • Support for package-resource pipeline for sklearn-pipeline.v1
                                                                                                                                                                "},{"location":"changelog/#v032-2022-06-03","title":"v0.3.2 - 2022-06-03","text":""},{"location":"changelog/#added_4","title":"Added","text":"
                                                                                                                                                                • compare_results in visualisation
                                                                                                                                                                "},{"location":"changelog/#v031-2022-06-02","title":"v0.3.1 - 2022-06-02","text":""},{"location":"changelog/#fixed_5","title":"Fixed","text":"
                                                                                                                                                                • Rescale transform now keeps origin on top-left corner
                                                                                                                                                                "},{"location":"changelog/#v030-2022-06-01","title":"v0.3.0 - 2022-06-01","text":""},{"location":"changelog/#added_5","title":"Added","text":"
                                                                                                                                                                • Styles management within the extractor
                                                                                                                                                                • styled.v1 aggregator, to handle styles
                                                                                                                                                                • rescale.v1 transform, to go back to the original height and width
                                                                                                                                                                "},{"location":"changelog/#changed_3","title":"Changed","text":"
                                                                                                                                                                • Styles and text extraction are handled by the extractor directly
                                                                                                                                                                • The PDFMiner line object is not carried around any more
                                                                                                                                                                "},{"location":"changelog/#removed","title":"Removed","text":"
                                                                                                                                                                • Outdated params entry in the EDS-PDF registry.
                                                                                                                                                                "},{"location":"changelog/#v022-2022-05-12","title":"v0.2.2 - 2022-05-12","text":""},{"location":"changelog/#changed_4","title":"Changed","text":"
                                                                                                                                                                • Fixed merge_lines bug when lines were empty
                                                                                                                                                                • Modified the demo consequently
                                                                                                                                                                "},{"location":"changelog/#v021-2022-05-09","title":"v0.2.1 - 2022-05-09","text":""},{"location":"changelog/#changed_5","title":"Changed","text":"
                                                                                                                                                                • The extractor always returns a pandas DataFrame, even if it is empty. This enhances robustness and stability.
                                                                                                                                                                "},{"location":"changelog/#v020-2022-05-09","title":"v0.2.0 - 2022-05-09","text":""},{"location":"changelog/#added_6","title":"Added","text":"
                                                                                                                                                                • aggregation submodule to handle the specifics of aggregating text blocs
                                                                                                                                                                • Base classes for better-defined modules
                                                                                                                                                                • Uniformise the columns to labels
                                                                                                                                                                • Add arbitrary contextual information
                                                                                                                                                                "},{"location":"changelog/#removed_1","title":"Removed","text":"
                                                                                                                                                                • typer legacy dependency
                                                                                                                                                                • models submodule, which handled the configurations for Spark distribution (deferred to another package)
                                                                                                                                                                • specific orbis context, which was APHP-specific
                                                                                                                                                                "},{"location":"changelog/#v010-2022-05-06","title":"v0.1.0 - 2022-05-06","text":"

                                                                                                                                                                Inception !

                                                                                                                                                                "},{"location":"changelog/#features","title":"Features","text":"
                                                                                                                                                                • spaCy-like configuration system
                                                                                                                                                                • Available classifiers :
                                                                                                                                                                • dummy.v1, that classifies everything to body
                                                                                                                                                                • mask.v1, for simple rule-based classification
                                                                                                                                                                • sklearn.v1, that uses a Scikit-Learn pipeline
                                                                                                                                                                • random.v1, to better sow chaos
                                                                                                                                                                • Merge different blocs together for easier visualisation
                                                                                                                                                                • Streamlit demo with visualisation
                                                                                                                                                                "},{"location":"configuration/","title":"Configuration","text":"

                                                                                                                                                                EDS-PDF is built on top of the confit configuration system.

                                                                                                                                                                The following catalogue registries are included within EDS-PDF:

                                                                                                                                                                Section Description factory Components factories (most often classes) adapter Raw data preprocessing functions

                                                                                                                                                                EDS-PDF pipelines are meant to be reproducible and serializable, such that you can always define a pipeline through the configuration system.

                                                                                                                                                                To wit, compare the API-based approach to the configuration-based approach (the two are strictly equivalent):

                                                                                                                                                                API-basedConfiguration-based
                                                                                                                                                                import edspdf\nfrom pathlib import Path\n\nmodel = edspdf.Pipeline()\nmodel.add_pipe(\"pdfminer-extractor\", name=\"extractor\")\nmodel.add_pipe(\"mask-classifier\", name=\"classifier\", config=dict(\nx0=0.2,\nx1=0.9,\ny0=0.3,\ny1=0.6,\nthreshold=0.1,\n))\nmodel.add_pipe(\"simple-aggregator\", name=\"aggregator\")\n# Get a PDF\npdf = Path(\"letter.pdf\").read_bytes()\n\npdf = model(pdf)\n\nstr(pdf.aggregated_texts[\"body\"])\n# Out: Cher Pr ABC, Cher DEF,\\n...\n
                                                                                                                                                                config.cfg
                                                                                                                                                                [pipeline]\npipeline = [\"extractor\", \"classifier\", \"aggregator\"]\n\n[components.extractor]\n@factory = \"pdfminer-extractor\"\n\n[components.classifier]\n@factory = \"mask-classifier\"\nx0 = 0.2\nx1 = 0.9\ny0 = 0.3\ny1 = 0.6\nthreshold = 0.1\n\n[components.aggregator]\n@factory = \"simple-aggregator\"\n
                                                                                                                                                                import edspdf\nfrom pathlib import Path\n\npipeline = edspdf.load(\"config.cfg\")\n# Get a PDF\npdf = Path(\"letter.pdf\").read_bytes()\n\npdf = pipeline(pdf)\n\nstr(pdf.aggregated_texts[\"body\"])\n# Out: Cher Pr ABC, Cher DEF,\\n...\n

                                                                                                                                                                The configuration-based approach strictly separates the definition of the pipeline from its application and avoids tucking away important configuration details. Changes to the pipeline are transparent as there is a single source of truth: the configuration file.

                                                                                                                                                                "},{"location":"contributing/","title":"Contributing to EDS-PDF","text":"

                                                                                                                                                                We welcome contributions ! There are many ways to help. For example, you can:

                                                                                                                                                                1. Help us track bugs by filing issues
                                                                                                                                                                2. Suggest and help prioritise new functionalities
                                                                                                                                                                3. Help us make the library as straightforward as possible, by simply asking questions on whatever does not seem clear to you.
                                                                                                                                                                "},{"location":"contributing/#development-installation","title":"Development installation","text":"

                                                                                                                                                                To be able to run the test suite and develop your own pipeline, you should clone the repo and install it locally. We use the hatch package manager to manage the project.

                                                                                                                                                                color:gray # Clone the repository and change directory\n$ git clone ssh://git@github.com/aphp/edspdf.git\n---> 100%\n\ncolor:gray # Ensure hatch is installed, preferably via pipx\n$ pipx install hatch\n\n$ cd edspdf\n\ncolor:gray # Enter a shell to develop / test the project. This will install everything required in a virtual environment. You can also `source` the path shown by hatch.\n$ hatch shell\n$ ...\n$ exit  # when you're done\n

                                                                                                                                                                To make sure the pipeline will not fail because of formatting errors, we added pre-commit hooks using the pre-commit Python library. To use it, simply install it:

                                                                                                                                                                $ pre-commit install\n

                                                                                                                                                                The pre-commit hooks defined in the configuration will automatically run when you commit your changes, letting you know if something went wrong.

                                                                                                                                                                The hooks only run on staged changes. To force-run it on all files, run:

                                                                                                                                                                $ pre-commit run --all-files\n---> 100%\ncolor:green All good !\n
                                                                                                                                                                "},{"location":"contributing/#proposing-a-merge-request","title":"Proposing a merge request","text":"

                                                                                                                                                                At the very least, your changes should :

                                                                                                                                                                • Be well-documented ;
                                                                                                                                                                • Pass every test, and preferably implement their own ;
                                                                                                                                                                • Follow the style guide.
                                                                                                                                                                "},{"location":"contributing/#testing-your-code","title":"Testing your code","text":"

                                                                                                                                                                We use the Pytest test suite.

                                                                                                                                                                The following command will run the test suite. Writing your own tests is encouraged !

                                                                                                                                                                pytest\n

                                                                                                                                                                Should your contribution propose a bug fix, we require the bug be thoroughly tested.

                                                                                                                                                                "},{"location":"contributing/#style-guide","title":"Style Guide","text":"

                                                                                                                                                                We use Black to reformat the code. While other formatters only enforce PEP8 compliance, Black also makes the code uniform. In short :

                                                                                                                                                                Black reformats entire files in place. It is not configurable.

                                                                                                                                                                Moreover, the CI/CD pipeline enforces a number of checks on the \"quality\" of the code. To wit, non black-formatted code will make the test pipeline fail. We use pre-commit to keep our codebase clean.

                                                                                                                                                                Refer to the development install tutorial for tips on how to format your files automatically. Most modern editors propose extensions that will format files on save.

                                                                                                                                                                "},{"location":"contributing/#documentation","title":"Documentation","text":"

                                                                                                                                                                Make sure to document your improvements, both within the code with comprehensive docstrings, as well as in the documentation itself if need be.

                                                                                                                                                                We use MkDocs for EDS-PDF's documentation. You can view your changes with

                                                                                                                                                                color:gray # Run the documentation\n$ hatch run docs:serve\n

                                                                                                                                                                Go to localhost:8000 to see your changes. MkDocs watches for changes in the documentation folder and automatically reloads the page.

                                                                                                                                                                "},{"location":"data-structures/","title":"Data Structures","text":"

                                                                                                                                                                EDS-PDF stores PDFs and their annotation in a custom data structures that are designed to be easy to use and manipulate. We must distinguish between:

                                                                                                                                                                • the data models used to store the PDFs and exchange them between the different components of EDS-PDF
                                                                                                                                                                • the tensor structures used to process the PDFs with deep learning models
                                                                                                                                                                "},{"location":"data-structures/#itinerary-of-a-pdf","title":"Itinerary of a PDF","text":"

                                                                                                                                                                A PDF is first converted to a PDFDoc object, which contains the raw PDF content. This task is usually performed by a PDF extractor component. Once the PDF is converted, the same object will be used and updated by the different components, and returned at the end of the pipeline.

                                                                                                                                                                When running a trainable component, the PDFDoc is preprocessed and converted to tensors containing relevant features for the task. This task is performed in the preprocess method of the component. The resulting tensors are then collated together to form a batch, in the collate method of the component. After running the forward method of the component, the tensor predictions are finally assigned as annotations to original PDFDoc objects in the postprocess method.

                                                                                                                                                                "},{"location":"data-structures/#data-models","title":"Data models","text":"

                                                                                                                                                                The main data structure is the PDFDoc, which represents a full PDF document. It contains the raw PDF content and annotations for the full document, regardless of pages. A PDF is split into Page objects that store their number, dimensions and optionally an image of the rendered page.

                                                                                                                                                                The PDF annotations are stored in Box objects, which represent a rectangular region of the PDF. At the moment, box can only be specialized into TextBox to represent text regions, such as lines extracted by a PDF extractor. Aggregated texts are stored in Text objects, that are not associated with a specific box.

                                                                                                                                                                A TextBox contains a list of TextProperties objects to store the style properties of styled spans of the text.

                                                                                                                                                                Reference"},{"location":"data-structures/#edspdf.structures.PDFDoc","title":"PDFDoc","text":"

                                                                                                                                                                Bases: BaseModel

                                                                                                                                                                This is the main data structure of the library to hold PDFs. It contains the content of the PDF, as well as box annotations and text outputs.

                                                                                                                                                                ATTRIBUTE DESCRIPTION content

                                                                                                                                                                The content of the PDF document.

                                                                                                                                                                TYPE: bytes

                                                                                                                                                                id

                                                                                                                                                                The ID of the PDF document.

                                                                                                                                                                TYPE: (str, optional)

                                                                                                                                                                pages

                                                                                                                                                                The pages of the PDF document.

                                                                                                                                                                TYPE: List[Page]

                                                                                                                                                                error

                                                                                                                                                                Whether there was an error when processing this PDF document.

                                                                                                                                                                TYPE: (bool, optional)

                                                                                                                                                                content_boxes

                                                                                                                                                                The content boxes/annotations of the PDF document.

                                                                                                                                                                TYPE: List[Union[TextBox, ImageBox]]

                                                                                                                                                                aggregated_texts

                                                                                                                                                                The aggregated text outputs of the PDF document.

                                                                                                                                                                TYPE: Dict[str, Text]

                                                                                                                                                                text_boxes

                                                                                                                                                                The text boxes of the PDF document.

                                                                                                                                                                TYPE: List[TextBox]

                                                                                                                                                                "},{"location":"data-structures/#edspdf.structures.Page","title":"Page","text":"

                                                                                                                                                                Bases: BaseModel

                                                                                                                                                                The Page class represents a page of a PDF document.

                                                                                                                                                                ATTRIBUTE DESCRIPTION page_num

                                                                                                                                                                The page number of the page.

                                                                                                                                                                TYPE: int

                                                                                                                                                                width

                                                                                                                                                                The width of the page.

                                                                                                                                                                TYPE: float

                                                                                                                                                                height

                                                                                                                                                                The height of the page.

                                                                                                                                                                TYPE: float

                                                                                                                                                                doc

                                                                                                                                                                The PDF document that this page belongs to.

                                                                                                                                                                TYPE: PDFDoc

                                                                                                                                                                image

                                                                                                                                                                The rendered image of the page, stored as a NumPy array.

                                                                                                                                                                TYPE: Optional[ndarray]

                                                                                                                                                                text_boxes

                                                                                                                                                                The text boxes of the page.

                                                                                                                                                                TYPE: List[TextBox]

                                                                                                                                                                "},{"location":"data-structures/#edspdf.structures.TextProperties","title":"TextProperties","text":"

                                                                                                                                                                Bases: BaseModel

                                                                                                                                                                The TextProperties class represents the style properties of a span of text in a TextBox.

                                                                                                                                                                ATTRIBUTE DESCRIPTION italic

                                                                                                                                                                Whether the text is italic.

                                                                                                                                                                TYPE: bool

                                                                                                                                                                bold

                                                                                                                                                                Whether the text is bold.

                                                                                                                                                                TYPE: bool

                                                                                                                                                                begin

                                                                                                                                                                The beginning index of the span of text.

                                                                                                                                                                TYPE: int

                                                                                                                                                                end

                                                                                                                                                                The ending index of the span of text.

                                                                                                                                                                TYPE: int

                                                                                                                                                                fontname

                                                                                                                                                                The font name of the span of text.

                                                                                                                                                                TYPE: Optional[str]

                                                                                                                                                                "},{"location":"data-structures/#edspdf.structures.Box","title":"Box","text":"

                                                                                                                                                                Bases: BaseModel

                                                                                                                                                                The Box class represents a box annotation in a PDF document. It is the base class of TextBox.

                                                                                                                                                                ATTRIBUTE DESCRIPTION doc

                                                                                                                                                                The PDF document that this box belongs to.

                                                                                                                                                                TYPE: PDFDoc

                                                                                                                                                                page_num

                                                                                                                                                                The page number of the box.

                                                                                                                                                                TYPE: Optional[int]

                                                                                                                                                                x0

                                                                                                                                                                The left x-coordinate of the box.

                                                                                                                                                                TYPE: float

                                                                                                                                                                x1

                                                                                                                                                                The right x-coordinate of the box.

                                                                                                                                                                TYPE: float

                                                                                                                                                                y0

                                                                                                                                                                The top y-coordinate of the box.

                                                                                                                                                                TYPE: float

                                                                                                                                                                y1

                                                                                                                                                                The bottom y-coordinate of the box.

                                                                                                                                                                TYPE: float

                                                                                                                                                                label

                                                                                                                                                                The label of the box.

                                                                                                                                                                TYPE: Optional[str]

                                                                                                                                                                page

                                                                                                                                                                The page object that this box belongs to.

                                                                                                                                                                TYPE: Page

                                                                                                                                                                "},{"location":"data-structures/#edspdf.structures.Text","title":"Text","text":"

                                                                                                                                                                Bases: BaseModel

                                                                                                                                                                The Text class represents a text object, not bound to any box.

                                                                                                                                                                It can be used to store aggregated text from multiple boxes for example.

                                                                                                                                                                ATTRIBUTE DESCRIPTION text

                                                                                                                                                                The text content.

                                                                                                                                                                TYPE: str

                                                                                                                                                                properties

                                                                                                                                                                The style properties of the text.

                                                                                                                                                                TYPE: List[TextProperties]

                                                                                                                                                                "},{"location":"data-structures/#edspdf.structures.TextBox","title":"TextBox","text":"

                                                                                                                                                                Bases: Box

                                                                                                                                                                The TextBox class represents a text box annotation in a PDF document.

                                                                                                                                                                ATTRIBUTE DESCRIPTION text

                                                                                                                                                                The text content of the text box.

                                                                                                                                                                TYPE: str

                                                                                                                                                                props

                                                                                                                                                                The style properties of the text box.

                                                                                                                                                                TYPE: List[TextProperties]

                                                                                                                                                                "},{"location":"data-structures/#edspdf.structures.PDFDoc","title":"Data Structures","text":""},{"location":"data-structures/#edspdf.structures.Page","title":"Data Structures","text":""},{"location":"data-structures/#edspdf.structures.TextProperties","title":"Data Structures","text":""},{"location":"data-structures/#edspdf.structures.Box","title":"Data Structures","text":""},{"location":"data-structures/#edspdf.structures.Text","title":"Data Structures","text":""},{"location":"data-structures/#edspdf.structures.TextBox","title":"Data Structures","text":""},{"location":"data-structures/#tensor-structure","title":"Tensor structure","text":"

                                                                                                                                                                The tensors used to process PDFs with deep learning models usually contain 4 main dimensions, in addition to the standard embedding dimensions:

                                                                                                                                                                • samples: one entry per PDF in the batch
                                                                                                                                                                • pages: one entry per page in a PDF
                                                                                                                                                                • boxes: one entry per box in a page
                                                                                                                                                                • token: one entry per token in a box (only for text boxes)

                                                                                                                                                                These tensors use a special FoldedTensor format to store the data in a compact way and reshape the data depending on the requirements of a layer.

                                                                                                                                                                "},{"location":"inference/","title":"Inference","text":"

                                                                                                                                                                Once you have obtained a pipeline, either by composing rule-based components, training a model or loading a model from the disk, you can use it to make predictions on documents. This is referred to as inference.

                                                                                                                                                                "},{"location":"inference/#inference-on-a-single-document","title":"Inference on a single document","text":"

                                                                                                                                                                In EDS-PDF, computing the prediction on a single document is done by calling the pipeline on the document. The input can be either:

                                                                                                                                                                • a sequence of bytes
                                                                                                                                                                • or a PDFDoc object
                                                                                                                                                                from pathlib import Path\n\npipeline = ...\ncontent = Path(\"path/to/.pdf\").read_bytes()\ndoc = pipeline(content)\n

                                                                                                                                                                If you're lucky enough to have a GPU, you can use it to speed up inference by moving the model to the GPU before calling the pipeline. To leverage multiple GPUs, refer to the multiprocessing accelerator description below.

                                                                                                                                                                pipeline.to(\"cuda\")  # same semantics as pytorch\ndoc = pipeline(content)\n
                                                                                                                                                                "},{"location":"inference/#inference-on-multiple-documents","title":"Inference on multiple documents","text":"

                                                                                                                                                                When processing multiple documents, it is usually more efficient to use the pipeline.pipe(...) method, especially when using deep learning components, since this allows matrix multiplications to be batched together. Depending on your computational resources and requirements, EDS-PDF comes with various \"accelerators\" to speed up inference (see the Accelerators section for more details). By default, the .pipe() method uses the simple accelerator but you can switch to a different one by passing the accelerator argument.

                                                                                                                                                                pipeline = ...\ndocs = pipeline.pipe(\n    [content1, content2, ...],\n    batch_size=16,  # optional, default to the one defined in the pipeline\n    accelerator=my_accelerator,\n)\n

                                                                                                                                                                The pipe method supports the following arguments:

                                                                                                                                                                PARAMETER DESCRIPTION inputs

                                                                                                                                                                The inputs to create the PDFDocs from, or the PDFDocs directly.

                                                                                                                                                                TYPE: Any

                                                                                                                                                                batch_size

                                                                                                                                                                The batch size to use. If not provided, the batch size of the pipeline object will be used.

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                accelerator

                                                                                                                                                                The accelerator to use for processing the documents. If not provided, the default accelerator will be used.

                                                                                                                                                                TYPE: Optional[Union[str, Accelerator]] DEFAULT: None

                                                                                                                                                                to_doc

                                                                                                                                                                The function to use to convert the inputs to PDFDoc objects. By default, the content field of the inputs will be used if dict-like objects are provided, otherwise the inputs will be passed directly to the pipeline.

                                                                                                                                                                TYPE: Optional[ToDoc] DEFAULT: None

                                                                                                                                                                from_doc

                                                                                                                                                                The function to use to convert the PDFDoc objects to outputs. By default, the PDFDoc objects will be returned directly.

                                                                                                                                                                TYPE: FromDoc DEFAULT: lambda doc: doc

                                                                                                                                                                "},{"location":"inference/#accelerators","title":"Accelerators","text":""},{"location":"inference/#edspdf.accelerators.simple.SimpleAccelerator","title":"Simple accelerator","text":"

                                                                                                                                                                This is the simplest accelerator, which batches the documents and processes each batch on the main process (the one calling .pipe()).

                                                                                                                                                                "},{"location":"inference/#edspdf.accelerators.simple.SimpleAccelerator--examples","title":"Examples","text":"
                                                                                                                                                                docs = list(pipeline.pipe([content1, content2, ...]))\n

                                                                                                                                                                or, if you want to override the model defined batch size

                                                                                                                                                                docs = list(pipeline.pipe([content1, content2, ...], batch_size=8))\n

                                                                                                                                                                which is equivalent to passing a confit dict

                                                                                                                                                                docs = list(\n    pipeline.pipe(\n        [content1, content2, ...],\n        accelerator={\n            \"@accelerator\": \"simple\",\n            \"batch_size\": 8,\n        },\n    )\n)\n

                                                                                                                                                                or the instantiated accelerator directly

                                                                                                                                                                from edspdf.accelerators.simple import SimpleAccelerator\n\naccelerator = SimpleAccelerator(batch_size=8)\ndocs = list(pipeline.pipe([content1, content2, ...], accelerator=accelerator))\n

                                                                                                                                                                If you have a GPU, make sure to move the model to the appropriate device before calling .pipe(). If you have multiple GPUs, use the multiprocessing accelerator instead.

                                                                                                                                                                pipeline.to(\"cuda\")\ndocs = list(pipeline.pipe([content1, content2, ...]))\n
                                                                                                                                                                PARAMETER DESCRIPTION batch_size

                                                                                                                                                                The number of documents to process in each batch.

                                                                                                                                                                TYPE: int DEFAULT: 32

                                                                                                                                                                "},{"location":"inference/#edspdf.accelerators.multiprocessing.MultiprocessingAccelerator","title":"Multiprocessing accelerator","text":"

                                                                                                                                                                If you have multiple CPU cores, and optionally multiple GPUs, we provide a multiprocessing accelerator that allows to run the inference on multiple processes.

                                                                                                                                                                This accelerator dispatches the batches between multiple workers (data-parallelism), and distribute the computation of a given batch on one or two workers (model-parallelism). This is done by creating two types of workers:

                                                                                                                                                                • a CPUWorker which handles the non deep-learning components and the preprocessing, collating and postprocessing of deep-learning components
                                                                                                                                                                • a GPUWorker which handles the forward call of the deep-learning components

                                                                                                                                                                The advantage of dedicating a worker to the deep-learning components is that it allows to prepare multiple batches in parallel in multiple CPUWorker, and ensure that the GPUWorker never wait for a batch to be ready.

                                                                                                                                                                The overall architecture is described in the following figure, for 3 CPU workers and 2 GPU workers.

                                                                                                                                                                Here is how a small pipeline with rule-based components and deep-learning components is distributed between the workers:

                                                                                                                                                                "},{"location":"inference/#edspdf.accelerators.multiprocessing.MultiprocessingAccelerator--examples","title":"Examples","text":"
                                                                                                                                                                docs = list(\n    pipeline.pipe(\n        [content1, content2, ...],\n        accelerator={\n            \"@accelerator\": \"multiprocessing\",\n            \"num_cpu_workers\": 3,\n            \"num_gpu_workers\": 2,\n            \"batch_size\": 8,\n        },\n    )\n)\n
                                                                                                                                                                PARAMETER DESCRIPTION batch_size

                                                                                                                                                                Number of documents to process at a time in a CPU/GPU worker

                                                                                                                                                                TYPE: int

                                                                                                                                                                num_cpu_workers

                                                                                                                                                                Number of CPU workers. A CPU worker handles the non deep-learning components and the preprocessing, collating and postprocessing of deep-learning components.

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                num_gpu_workers

                                                                                                                                                                Number of GPU workers. A GPU worker handles the forward call of the deep-learning components.

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                gpu_pipe_names

                                                                                                                                                                List of pipe names to accelerate on a GPUWorker, defaults to all pipes that inherit from TrainablePipe

                                                                                                                                                                TYPE: Optional[List[str]] DEFAULT: None

                                                                                                                                                                "},{"location":"pipeline/","title":"Pipeline","text":"

                                                                                                                                                                The goal of EDS-PDF is to provide a framework for processing PDF documents, along with some utilities and a few components, stitched together by a robust pipeline and configuration system.

                                                                                                                                                                Processing PDFs usually involves many steps such as extracting lines, running OCR models, detecting and classifying boxes, filtering and aggregating parts of the extracted texts, etc. Organising these steps together, combining static and deep learning components, while remaining modular and efficient is a challenge. This is why EDS-PDF is built on top of a new pipelining system.

                                                                                                                                                                Deep learning frameworks

                                                                                                                                                                The EDS-PDF trainable components are built around the PyTorch framework. While you can use any technology in static components, we do not provide tools to train components built with other deep learning frameworks.

                                                                                                                                                                "},{"location":"pipeline/#creating-a-pipeline","title":"Creating a pipeline","text":"

                                                                                                                                                                A pipe is a processing block (like a function) that applies a transformation on its input and returns a modified object.

                                                                                                                                                                At the moment, four types of pipes are implemented in the library:

                                                                                                                                                                1. extraction components extract lines from a raw PDF and return a PDFDoc object filled with these text boxes.
                                                                                                                                                                2. classification components classify each box with labels, such as body, header, footer...
                                                                                                                                                                3. aggregation components compile the lines together according to their classes to re-create the original text.
                                                                                                                                                                4. embedding components don't directly update the annotations on the document but have specific deep-learning methods (see the TrainablePipe page) that can be composed to form a machine learning model.

                                                                                                                                                                To create your first pipeline, execute the following code:

                                                                                                                                                                from edspdf import Pipeline\n\nmodel = Pipeline()\n# will extract text lines from a document\nmodel.add_pipe(\n    \"pdfminer-extractor\",\n    config=dict(\n        extract_style=False,\n    ),\n)\n# classify everything inside the `body` bounding box as `body`\nmodel.add_pipe(\n    \"mask-classifier\", config=dict(body={\"x0\": 0.1, \"y0\": 0.1, \"x1\": 0.9, \"y1\": 0.9})\n)\n# aggregates the lines together to re-create the original text\nmodel.add_pipe(\"simple-aggregator\")\n

                                                                                                                                                                This pipeline can then be run on one or more PDF documents. As the pipeline process documents, components will be called in the order they were added to the pipeline.

                                                                                                                                                                from pathlib import Path\n\npdf_bytes = Path(\"path/to/your/pdf\").read_bytes()\n\n# Processing one document\nmodel(pdf_bytes)\n\n# Processing multiple documents\nmodel.pipe([pdf_bytes, ...])\n

                                                                                                                                                                For more information on how to use the pipeline, refer to the Inference page.

                                                                                                                                                                "},{"location":"pipeline/#hybrid-models","title":"Hybrid models","text":"

                                                                                                                                                                EDS-PDF was designed to facilitate the training and inference of hybrid models that arbitrarily chain static components or trained deep learning components. Static components are callable objects that take a PDFDoc object as input, perform arbitrary transformations over the input, and return the modified object. Trainable pipes, on the other hand, allow for deep learning operations to be performed on the PDFDoc object and must be trained to be used.

                                                                                                                                                                "},{"location":"pipeline/#saving-and-loading-a-pipeline","title":"Saving and loading a pipeline","text":"

                                                                                                                                                                Pipelines can be saved and loaded using the save and load methods. The saved pipeline is not a pickled object but a folder containing the config file, the weights and extra resources for each pipeline. This allows for easy inspection and modification of the pipeline, and avoids the execution of arbitrary code when loading a pipeline.

                                                                                                                                                                model.save(\"path/to/your/model\")\nmodel = edspdf.load(\"path/to/your/model\")\n

                                                                                                                                                                To share the pipeline and turn it into a pip installable package, you can use the package method, which will use or create a pyproject.toml file, fill it accordingly, and create a wheel file. At the moment, we only support the poetry package manager.

                                                                                                                                                                model.package(\n    name=\"your-package-name\",  # leave None to reuse name in pyproject.toml\n    version=\"0.0.1\",\n    root_dir=\"path/to/project/root\",  # optional, to retrieve an existing pyproject.toml file\n    # if you don't have a pyproject.toml, you can provide the metadata here instead\n    metadata=dict(\n        authors=\"Firstname Lastname <your.email@domain.fr>\",\n        description=\"A short description of your package\",\n    ),\n)\n

                                                                                                                                                                This will create a wheel file in the root_dir/dist folder, which you can share and install with pip.

                                                                                                                                                                "},{"location":"roadmap/","title":"Roadmap","text":"
                                                                                                                                                                • Style extraction
                                                                                                                                                                • Custom hybrid torch-based pipeline & configuration system
                                                                                                                                                                • Drop pandas DataFrame in favour of a ~~Cython~~ attr wrapper around PDF documents?
                                                                                                                                                                • Add training capabilities with a CLI to automate the annotation/preparation/training loop. Again, draw inspiration from spaCy, and maybe add the notion of a TrainableClassifier...
                                                                                                                                                                • Add complete serialisation capabilities, to save a full pipeline to disk. Draw inspiration from spaCy, which took great care to solve these issues: add save and load methods to every pipeline component
                                                                                                                                                                • Multiple-column extraction
                                                                                                                                                                • Table detector
                                                                                                                                                                • Integrate third-party OCR module
                                                                                                                                                                "},{"location":"trainable-pipes/","title":"Trainable pipes","text":"

                                                                                                                                                                Trainable pipes allow for deep learning operations to be performed on the PDFDoc object and must be trained to be used. Such pipes can be used to train a model to predict the label of the lines extracted from a PDF document.

                                                                                                                                                                "},{"location":"trainable-pipes/#anatomy-of-a-trainable-pipe","title":"Anatomy of a trainable pipe","text":"

                                                                                                                                                                Building and running deep learning models usually requires preprocessing the input sample into features, batching or \"collating\" these features together to process multiple samples at once, running deep learning operations over these features (in PyTorch, this step is done in the forward method) and postprocessing the outputs of these operations to complete the original sample.

                                                                                                                                                                In the trainable pipes of EDS-PDF, preprocessing and postprocessing are decoupled from the deep learning code but collocated with the forward method. This is achieved by splitting the class of a trainable component into four methods, which allows us to keep the development of new deep-learning components simple while ensuring efficient models both during training and inference.

                                                                                                                                                                "},{"location":"trainable-pipes/#edspdf.trainable_pipe.TrainablePipe.preprocess","title":"preprocess","text":"

                                                                                                                                                                Preprocess the document to extract features that will be used by the neural network to perform its predictions.

                                                                                                                                                                PARAMETER DESCRIPTION doc

                                                                                                                                                                PDFDocument to preprocess

                                                                                                                                                                TYPE: PDFDoc

                                                                                                                                                                RETURNS DESCRIPTION Dict[str, Any]

                                                                                                                                                                Dictionary (optionally nested) containing the features extracted from the document.

                                                                                                                                                                "},{"location":"trainable-pipes/#edspdf.trainable_pipe.TrainablePipe.collate","title":"collate","text":"

                                                                                                                                                                Collate the batch of features into a single batch of tensors that can be used by the forward method of the component.

                                                                                                                                                                PARAMETER DESCRIPTION batch

                                                                                                                                                                Batch of features

                                                                                                                                                                TYPE: NestedSequences

                                                                                                                                                                device

                                                                                                                                                                Device on which the tensors should be moved

                                                                                                                                                                TYPE: device

                                                                                                                                                                RETURNS DESCRIPTION InputBatch

                                                                                                                                                                Dictionary (optionally nested) containing the collated tensors

                                                                                                                                                                "},{"location":"trainable-pipes/#edspdf.trainable_pipe.TrainablePipe.forward","title":"forward","text":"

                                                                                                                                                                Perform the forward pass of the neural network, i.e, apply transformations over the collated features to compute new embeddings, probabilities, losses, etc

                                                                                                                                                                PARAMETER DESCRIPTION batch

                                                                                                                                                                Batch of tensors (nested dictionary) computed by the collate method

                                                                                                                                                                TYPE: InputBatch

                                                                                                                                                                RETURNS DESCRIPTION OutputBatch"},{"location":"trainable-pipes/#edspdf.trainable_pipe.TrainablePipe.postprocess","title":"postprocess","text":"

                                                                                                                                                                Update the documents with the predictions of the neural network, for instance converting label probabilities into label attributes on the document lines.

                                                                                                                                                                By default, this is a no-op.

                                                                                                                                                                PARAMETER DESCRIPTION docs

                                                                                                                                                                Batch of documents

                                                                                                                                                                TYPE: Sequence[PDFDoc]

                                                                                                                                                                batch

                                                                                                                                                                Batch of predictions, as returned by the forward method

                                                                                                                                                                TYPE: OutputBatch

                                                                                                                                                                RETURNS DESCRIPTION Sequence[PDFDoc]

                                                                                                                                                                Additionally, there is a fifth method:

                                                                                                                                                                "},{"location":"trainable-pipes/#edspdf.trainable_pipe.TrainablePipe.post_init","title":"post_init","text":"

                                                                                                                                                                This method completes the attributes of the component, by looking at some documents. It is especially useful to build vocabularies or detect the labels of a classification task.

                                                                                                                                                                PARAMETER DESCRIPTION gold_data

                                                                                                                                                                The documents to use for initialization.

                                                                                                                                                                TYPE: Iterable[PDFDoc]

                                                                                                                                                                exclude

                                                                                                                                                                The names of components to exclude from initialization. This argument will be gradually updated with the names of initialized components

                                                                                                                                                                TYPE: set

                                                                                                                                                                "},{"location":"trainable-pipes/#implementing-a-trainable-component","title":"Implementing a trainable component","text":"

                                                                                                                                                                Here is an example of a trainable component:

                                                                                                                                                                from typing import Any, Dict, Iterable, Sequence\n\nimport torch\nfrom tqdm import tqdm\n\nfrom edspdf import Pipeline, TrainablePipe, registry\nfrom edspdf.structures import PDFDoc\n\n\n@registry.factory.register(\"my-component\")\nclass MyComponent(TrainablePipe):\n    def __init__(\n        self,\n        # A subcomponent\n        pipeline: Pipeline,\n        name: str,\n        embedding: TrainablePipe,\n    ):\n        super().__init__(pipeline=pipeline, name=name)\n        self.embedding = embedding\n\n    def post_init(self, gold_data: Iterable[PDFDoc], exclude: set):\n        # Initialize the component with the gold documents\n        with self.label_vocabulary.initialization():\n            for doc in tqdm(gold_data, desc=\"Initializing the component\"):\n                # Do something like learning a vocabulary over the initialization\n                # documents\n                ...\n\n        # And post_init the subcomponent\n        exclude.add(self.name)\n        self.embedding.post_init(gold_data, exclude)\n\n        # Initialize any layer that might be missing from the module\n        self.classifier = torch.nn.Linear(...)\n\n    def preprocess(self, doc: PDFDoc, supervision: bool = False) -> Dict[str, Any]:\n        # Preprocess the doc to extract features required to run the embedding\n        # subcomponent, and this component\n        return {\n            \"embedding\": self.embedding.preprocess_supervised(doc),\n            \"my-feature\": ...(doc),\n        }\n\n    def collate(self, batch, device: torch.device) -> Dict:\n        # Collate the features of the \"embedding\" subcomponent\n        # and the features of this component as well\n        return {\n            \"embedding\": self.embedding.collate(batch[\"embedding\"], device),\n            \"my-feature\": 
torch.as_tensor(batch[\"my-feature\"], device=device),\n        }\n\n    def forward(self, batch: Dict, supervision=False) -> Dict:\n        # Call the embedding subcomponent\n        embeds = self.embedding(batch[\"embedding\"])\n\n        # Do something with the embedding tensors\n        output = ...(embeds)\n\n        return output\n\n    def postprocess(self, docs: Sequence[PDFDoc], output: Dict) -> Sequence[PDFDoc]:\n        # Annotate the docs with the outputs of the forward method\n        ...\n        return docs\n
                                                                                                                                                                "},{"location":"trainable-pipes/#nesting-trainable-pipes","title":"Nesting trainable pipes","text":"

                                                                                                                                                                Like pytorch modules, you can compose trainable pipes together to build complex architectures. For instance, a trainable classifier component may delegate some of its logic to an embedding component, which will only be responsible for converting PDF lines into multidimensional arrays of numbers.

                                                                                                                                                                Nesting pipes allows switching parts of the neural networks to test various architectures and keeping the modelling logic modular.

                                                                                                                                                                "},{"location":"trainable-pipes/#sharing-subcomponents","title":"Sharing subcomponents","text":"

                                                                                                                                                                Sharing parts of a neural network while training on different tasks can be an effective way to improve the network efficiency. For instance, it is common to share an embedding layer between multiple tasks that require embedding the same inputs.

                                                                                                                                                                In EDS-PDF, sharing a subcomponent is simply done by sharing the object between the multiple pipes. You can either refer to an existing subcomponent when configuring a new component in Python, or use the interpolation mechanism of our configuration system.

                                                                                                                                                                API-basedConfiguration-based
                                                                                                                                                                pipeline.add_pipe(\n    \"my-component-1\",\n    name=\"first\",\n    config={\n        \"embedding\": {\n            \"@factory\": \"box-embedding\",\n            # ...\n        }\n    },\n)\npipeline.add_pipe(\n    \"my-component-2\",\n    name=\"second\",\n    config={\n        \"embedding\": pipeline.components.first.embedding,\n    },\n)\n
                                                                                                                                                                [components.first]\n@factory = \"my-component-1\"\n\n[components.first.embedding]\n@factory = \"box-embedding\"\n...\n\n[components.second]\n@factory = \"my-component-2\"\nembedding = ${components.first.embedding}\n

                                                                                                                                                                To avoid recomputing the preprocess / forward and collate in the multiple components that use it, we rely on a light cache system.

                                                                                                                                                                During the training loop, when computing the loss for each component, the forward calls must be wrapped by the pipeline.cache() context to enable this caching mechanism between components.

                                                                                                                                                                "},{"location":"layers/","title":"Deep learning layers","text":"

                                                                                                                                                                EDS-PDF provides a set of specialized deep learning layers that can be used to build trainable components. These layers are built on top of the PyTorch framework and can be used in any PyTorch model.

                                                                                                                                                                Layer Description BoxTransformerModule Contextualize box embeddings with a 2d Transformer with relative position representations BoxTransformerLayer A single layer of the above BoxTransformerModule layer RelativeAttention A 2d attention layer that optionally uses relative position to compute its attention scores SinusoidalEmbedding A position embedding that uses trigonometric functions to encode positions Vocabulary A non deep learning layer to encode / decode vocabularies"},{"location":"layers/box-transformer-layer/","title":"BoxTransformerLayer","text":"

                                                                                                                                                                BoxTransformerLayer combining a self attention layer and a linear->activation->linear transformation. This layer is used in the BoxTransformerModule module.

                                                                                                                                                                "},{"location":"layers/box-transformer-layer/#edspdf.layers.box_transformer.BoxTransformerLayer--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION input_size

                                                                                                                                                                Input embedding size

                                                                                                                                                                TYPE: int

                                                                                                                                                                num_heads

                                                                                                                                                                Number of attention heads in the attention layer

                                                                                                                                                                TYPE: int DEFAULT: 2

                                                                                                                                                                dropout_p

                                                                                                                                                                Dropout probability both for the attention layer and embedding projections

                                                                                                                                                                TYPE: float DEFAULT: 0.0

                                                                                                                                                                head_size

                                                                                                                                                                Head sizes of the attention layer

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                activation

                                                                                                                                                                Activation function used in the linear->activation->linear transformation

                                                                                                                                                                TYPE: ActivationFunction DEFAULT: 'gelu'

                                                                                                                                                                init_resweight

                                                                                                                                                                Initial weight of the residual gates. At 0, the layer acts (initially) as an identity function, and at 1 as a standard Transformer layer. Initializing with a value close to 0 can help the training converge.

                                                                                                                                                                TYPE: float DEFAULT: 0.0

                                                                                                                                                                attention_mode

                                                                                                                                                                Mode of relative position infused attention layer. See the relative attention documentation for more information.

                                                                                                                                                                TYPE: Sequence[Literal['c2c', 'c2p', 'p2c']] DEFAULT: ('c2c', 'c2p', 'p2c')

                                                                                                                                                                position_embedding

                                                                                                                                                                Position embedding to use as key/query position embedding in the attention computation.

                                                                                                                                                                TYPE: Optional[Union[FloatTensor, Parameter]] DEFAULT: None

                                                                                                                                                                "},{"location":"layers/box-transformer-layer/#edspdf.layers.box_transformer.BoxTransformerLayer.forward","title":"forward","text":"

                                                                                                                                                                Forward pass of the BoxTransformerLayer

                                                                                                                                                                PARAMETER DESCRIPTION embeds

                                                                                                                                                                Embeddings to contextualize Shape: n_samples * n_keys * input_size

                                                                                                                                                                TYPE: FloatTensor

                                                                                                                                                                mask

                                                                                                                                                                Mask of the embeddings. 0 means padding element. Shape: n_samples * n_keys

                                                                                                                                                                TYPE: BoolTensor

                                                                                                                                                                relative_positions

                                                                                                                                                                Position of the keys relative to the query elements Shape: n_samples * n_queries * n_keys * n_coordinates (2 for x/y)

                                                                                                                                                                TYPE: LongTensor

                                                                                                                                                                no_position_mask

                                                                                                                                                                Key / query pairs for which the position attention terms should be disabled. Shape: n_samples * n_queries * n_keys

                                                                                                                                                                TYPE: Optional[BoolTensor] DEFAULT: None

                                                                                                                                                                RETURNS DESCRIPTION Tuple[FloatTensor, FloatTensor]
                                                                                                                                                                • Contextualized embeddings Shape: n_samples * n_queries * n_keys
                                                                                                                                                                • Attention logits Shape: n_samples * n_queries * n_keys * n_heads
                                                                                                                                                                "},{"location":"layers/box-transformer/","title":"BoxTransformerModule","text":"

                                                                                                                                                                Box Transformer architecture combining multiple BoxTransformerLayer modules. It is mainly used in BoxTransformer.

                                                                                                                                                                "},{"location":"layers/box-transformer/#edspdf.layers.box_transformer.BoxTransformerModule--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION input_size

                                                                                                                                                                Input embedding size

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                num_heads

                                                                                                                                                                Number of attention heads in the attention layers

                                                                                                                                                                TYPE: int DEFAULT: 2

                                                                                                                                                                n_relative_positions

                                                                                                                                                                Maximum range of embeddable relative positions between boxes (further distances are capped to \u00b1n_relative_positions // 2)

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                dropout_p

                                                                                                                                                                Dropout probability both for the attention layers and embedding projections

                                                                                                                                                                TYPE: float DEFAULT: 0.0

                                                                                                                                                                head_size

                                                                                                                                                                Head sizes of the attention layers

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                activation

                                                                                                                                                                Activation function used in the linear->activation->linear transformations

                                                                                                                                                                TYPE: ActivationFunction DEFAULT: 'gelu'

                                                                                                                                                                init_resweight

                                                                                                                                                                Initial weight of the residual gates. At 0, the layer acts (initially) as an identity function, and at 1 as a standard Transformer layer. Initializing with a value close to 0 can help the training converge.

                                                                                                                                                                TYPE: float DEFAULT: 0.0

                                                                                                                                                                attention_mode

                                                                                                                                                                Mode of relative position infused attention layer. See the relative attention documentation for more information.

                                                                                                                                                                TYPE: Sequence[Literal['c2c', 'c2p', 'p2c']] DEFAULT: ('c2c', 'c2p', 'p2c')

                                                                                                                                                                n_layers

                                                                                                                                                                Number of layers in the Transformer

                                                                                                                                                                TYPE: int DEFAULT: 2

                                                                                                                                                                "},{"location":"layers/box-transformer/#edspdf.layers.box_transformer.BoxTransformerModule.forward","title":"forward","text":"

                                                                                                                                                                Forward pass of the BoxTransformer

                                                                                                                                                                PARAMETER DESCRIPTION embeds

                                                                                                                                                                Embeddings to contextualize Shape: n_samples * n_keys * input_size

                                                                                                                                                                TYPE: FoldedTensor

                                                                                                                                                                boxes

                                                                                                                                                                Layout features of the input elements

                                                                                                                                                                TYPE: Dict

                                                                                                                                                                RETURNS DESCRIPTION Tuple[FloatTensor, List[FloatTensor]]
                                                                                                                                                                • Output of the last BoxTransformerLayer Shape: n_samples * n_queries * n_keys
                                                                                                                                                                • Attention logits of all layers Shape: n_samples * n_queries * n_keys * n_heads
                                                                                                                                                                "},{"location":"layers/relative-attention/","title":"RelativeAttention","text":"

                                                                                                                                                                A self/cross-attention layer that takes relative position of elements into account to compute the attention weights. When running a relative attention layer, keys and queries are represented using content and position embeddings, where position embeddings are retrieved using the relative position of keys relative to queries.

                                                                                                                                                                "},{"location":"layers/relative-attention/#edspdf.layers.relative_attention.RelativeAttention--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION size

                                                                                                                                                                The size of the output embeddings Also serves as default if query_size, pos_size, or key_size is None

                                                                                                                                                                TYPE: int

                                                                                                                                                                n_heads

                                                                                                                                                                The number of attention heads

                                                                                                                                                                TYPE: int

                                                                                                                                                                query_size

                                                                                                                                                                The size of the query embeddings.

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                key_size

                                                                                                                                                                The size of the key embeddings.

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                value_size

                                                                                                                                                                The size of the value embeddings

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                head_size

                                                                                                                                                                The size of each query / key / value chunk used in the attention dot product Default: key_size / n_heads

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                position_embedding

                                                                                                                                                                The position embedding used as key and query embeddings

                                                                                                                                                                TYPE: Optional[Union[FloatTensor, Parameter]] DEFAULT: None

                                                                                                                                                                dropout_p

                                                                                                                                                                Dropout probability applied on the attention weights Default: 0.0

                                                                                                                                                                TYPE: float DEFAULT: 0.0

                                                                                                                                                                same_key_query_proj

                                                                                                                                                                Whether to use the same projection operator for content key and queries when computing the pre-attention key and query embedding chunks Default: False

                                                                                                                                                                TYPE: bool DEFAULT: False

                                                                                                                                                                same_positional_key_query_proj

                                                                                                                                                                Whether to use the same projection operator for content key and queries when computing the pre-attention key and query embedding chunks Default: False

                                                                                                                                                                TYPE: bool DEFAULT: False

                                                                                                                                                                n_coordinates

                                                                                                                                                                The number of positional coordinates For instance, text is 1D so 1 coordinate, images are 2D so 2 coordinates ... Default: 1

                                                                                                                                                                TYPE: int DEFAULT: 1

                                                                                                                                                                head_bias

                                                                                                                                                                Whether to learn a bias term to add to the attention logits This is only useful if you plan to use the attention logits for subsequent operations, since attention weights are unaffected by bias terms.

                                                                                                                                                                TYPE: bool DEFAULT: True

                                                                                                                                                                do_pooling

                                                                                                                                                                Whether to compute the output embedding. If you only plan to use attention logits, you should disable this parameter. Default: True

                                                                                                                                                                TYPE: bool DEFAULT: True

                                                                                                                                                                mode

                                                                                                                                                                Whether to compute content to content (c2c), content to position (c2p) or position to content (p2c) attention terms. Setting mode=('c2c',) disables the relative position attention terms: this is the standard attention layer. To get a better intuition about these different types of attention, here is a formulation as fictitious search samples from a word in a (1D) text:

                                                                                                                                                                • content-content : \"my content is \u2019ultrasound\u2019 so I\u2019m looking for other words whose content contains information about temporality\"
                                                                                                                                                                • content-position: \"my content is \u2019ultrasound\u2019 so I\u2019m looking for other words that are 3 positions after me\"
                                                                                                                                                                • position-content : \"regardless of my content, I will attend to the word one position after me if it contains information about temporality, two words after me if it contains information about location, etc.\"

                                                                                                                                                                TYPE: Sequence[Literal['c2c', 'c2p', 'p2c']] DEFAULT: ('c2c', 'p2c', 'c2p')

                                                                                                                                                                n_additional_heads

                                                                                                                                                                The number of additional head logits to compute. Those are not used to compute output embeddings, but may be useful in subsequent operation. Default: 0

                                                                                                                                                                TYPE: int DEFAULT: 0

                                                                                                                                                                "},{"location":"layers/relative-attention/#edspdf.layers.relative_attention.RelativeAttention.forward","title":"forward","text":"

                                                                                                                                                                Forward pass of the RelativeAttention layer.

                                                                                                                                                                PARAMETER DESCRIPTION content_queries

                                                                                                                                                                The content query embedding to use in the attention computation Shape: n_samples * n_queries * query_size

                                                                                                                                                                TYPE: FloatTensor

                                                                                                                                                                content_keys

                                                                                                                                                                The content key embedding to use in the attention computation. If None, defaults to the content_queries Shape: n_samples * n_keys * query_size

                                                                                                                                                                TYPE: Optional[FloatTensor] DEFAULT: None

                                                                                                                                                                content_values

                                                                                                                                                                The content values embedding to use in the final pooling computation. If None, pooling won't be performed. Shape: n_samples * n_keys * query_size

                                                                                                                                                                TYPE: Optional[FloatTensor] DEFAULT: None

                                                                                                                                                                mask

                                                                                                                                                                The content key embedding to use in the attention computation. If None, defaults to the content_queries Shape: either - n_samples * n_keys - n_samples * n_queries * n_keys - n_samples * n_queries * n_keys * n_heads

                                                                                                                                                                TYPE: Optional[BoolTensor] DEFAULT: None

                                                                                                                                                                relative_positions

                                                                                                                                                                The relative position of keys relative to queries If None, positional attention terms won't be computed. Shape: n_samples * n_queries * n_keys * n_coordinates

                                                                                                                                                                TYPE: Optional[LongTensor] DEFAULT: None

                                                                                                                                                                no_position_mask

                                                                                                                                                                Key / query pairs for which the position attention terms should be disabled. Shape: n_samples * n_queries * n_keys

                                                                                                                                                                TYPE: Optional[BoolTensor] DEFAULT: None

                                                                                                                                                                base_attn

                                                                                                                                                                Attention logits to add to the computed attention logits Shape: n_samples * n_queries * n_keys * n_heads

                                                                                                                                                                TYPE: Optional[FloatTensor] DEFAULT: None

                                                                                                                                                                RETURNS DESCRIPTION Union[Tuple[FloatTensor, FloatTensor], FloatTensor]
                                                                                                                                                                • the output contextualized embeddings (only if content_values is not None and the do_pooling attribute is set to True) Shape: n_samples * n_keys * size
                                                                                                                                                                • the attention logits Shape: n_samples * n_keys * n_queries * (n_heads + n_additional_heads)
                                                                                                                                                                "},{"location":"layers/sinusoidal-embedding/","title":"SinusoidalEmbedding","text":"

                                                                                                                                                                A position embedding lookup table that stores embeddings for a fixed number of positions. The value of each of the embedding_dim channels of the generated embedding is generated according to a trigonometric function (sin for even channels, cos for odd channels). The frequency of the signal in each pair of channels varies according to the temperature parameter.

                                                                                                                                                                Any input position above the maximum value num_embeddings will be capped to num_embeddings - 1

                                                                                                                                                                "},{"location":"layers/sinusoidal-embedding/#edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION num_embeddings

                                                                                                                                                                The maximum number of position embeddings stored in this table

                                                                                                                                                                TYPE: int

                                                                                                                                                                embedding_dim

                                                                                                                                                                The embedding size

                                                                                                                                                                TYPE: int

                                                                                                                                                                temperature

                                                                                                                                                                The temperature controls the range of frequencies used by each channel of the embedding

                                                                                                                                                                TYPE: float DEFAULT: 10000.0

                                                                                                                                                                "},{"location":"layers/sinusoidal-embedding/#edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding.forward","title":"forward","text":"

                                                                                                                                                                Forward pass of the SinusoidalEmbedding module

                                                                                                                                                                PARAMETER DESCRIPTION indices

                                                                                                                                                                Shape: any

                                                                                                                                                                TYPE: LongTensor

                                                                                                                                                                RETURNS DESCRIPTION FloatTensor

                                                                                                                                                                Shape: (*input_shape, embedding_dim)

                                                                                                                                                                "},{"location":"layers/vocabulary/","title":"Vocabulary","text":"

                                                                                                                                                                Vocabulary layer. This is not meant to be used as a torch.nn.Module but subclassing torch.nn.Module makes the instances appear when printing a model, which is nice.

                                                                                                                                                                "},{"location":"layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION items

                                                                                                                                                                Initial vocabulary elements if any. Specific elements such as padding and unk can be set here to enforce their index in the vocabulary.

                                                                                                                                                                TYPE: Sequence[T] DEFAULT: None

                                                                                                                                                                default

                                                                                                                                                                Default index to use for out-of-vocabulary elements. Defaults to -100

                                                                                                                                                                TYPE: int DEFAULT: -100

                                                                                                                                                                "},{"location":"layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary-functions","title":"Functions","text":""},{"location":"layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary.initialization","title":"initialization","text":"

                                                                                                                                                                Enters the initialization mode. Out of vocabulary elements will be assigned an index.

                                                                                                                                                                "},{"location":"layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary.encode","title":"encode","text":"

                                                                                                                                                                Converts an element into its vocabulary index. If the layer is in its initialization mode (with vocab.initialization(): ...), and the element is out of vocabulary, a new index will be created and returned. Otherwise, any out-of-vocabulary element will be encoded with the default index.

                                                                                                                                                                PARAMETER DESCRIPTION item

                                                                                                                                                                RETURNS DESCRIPTION int"},{"location":"layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary.decode","title":"decode","text":"

                                                                                                                                                                Converts an index into its original value

                                                                                                                                                                PARAMETER DESCRIPTION idx

                                                                                                                                                                RETURNS DESCRIPTION InputT"},{"location":"pipes/","title":"Components overview","text":"

                                                                                                                                                                EDS-PDF provides easy-to-use components for defining PDF processing pipelines.

                                                                                                                                                                Box extractorsBox classifiersAggregatorsEmbeddings Factory name Description pdfminer-extractor Extracts text lines with the pdfminer library mupdf-extractor Extracts text lines with the pymupdf library poppler-extractor Extracts text lines with the poppler library Factory name Description mask-classifier Simple rule-based classification multi-mask-classifier Simple rule-based classification dummy-classifier Dummy classifier, for testing purposes. random-classifier To sow chaos trainable-classifier Trainable box classification model Factory name Description simple-aggregator Returns a dictionary with one key for each detected class

                                                                                                                                                                Factory name Description simple-text-embedding A module that embeds the textual features of the blocks. embedding-combiner Encodes boxes using a combination of multiple encoders sub-box-cnn-pooler Pools the output of a CNN over the elements of a box (like words) box-layout-embedding Encodes the layout of the boxes box-transformer Contextualizes box representations using a transformer huggingface-embedding Box representations using a Huggingface multi-modal model.

                                                                                                                                                                You can add them to your EDS-PDF pipeline by simply calling add_pipe, for instance:

                                                                                                                                                                # \u2191 Omitted code that defines the pipeline object \u2191\npipeline.add_pipe(\"pdfminer-extractor\", name=\"component-name\", config=...)\n
                                                                                                                                                                "},{"location":"pipes/aggregators/","title":"Aggregation","text":"

                                                                                                                                                                The aggregation step compiles extracted text blocks together according to their detected class.

                                                                                                                                                                Factory name Description simple-aggregator Returns a dictionary with one key for each detected class"},{"location":"pipes/aggregators/simple-aggregator/","title":"Simple aggregator","text":""},{"location":"pipes/aggregators/simple-aggregator/#edspdf.pipes.aggregators.simple.SimpleAggregator","title":"SimpleAggregator","text":"

                                                                                                                                                                Aggregator that returns texts and styles. It groups all text boxes with the same label under the aggregated_text, and additionally aggregates the styles of the text boxes.

                                                                                                                                                                "},{"location":"pipes/aggregators/simple-aggregator/#edspdf.pipes.aggregators.simple.SimpleAggregator--examples","title":"Examples","text":"

                                                                                                                                                                Create a pipeline

                                                                                                                                                                API-basedConfiguration-based
                                                                                                                                                                pipeline = ...\npipeline.add_pipe(\n    \"simple-aggregator\",\n    name=\"aggregator\",\n    config={\n        \"new_line_threshold\": 0.2,\n        \"new_paragraph_threshold\": 1.5,\n        \"label_map\": {\n            \"body\": \"text\",\n            \"table\": \"text\",\n        },\n    },\n)\n
                                                                                                                                                                ...\n\n[components.aggregator]\n@factory = \"simple-aggregator\"\nnew_line_threshold = 0.2\nnew_paragraph_threshold = 1.5\nlabel_map = { body = \"text\", table = \"text\" }\n\n...\n

                                                                                                                                                                and run it on a document:

                                                                                                                                                                doc = pipeline(doc)\nprint(doc.aggregated_texts)\n# {\n#     \"text\": \"This is the body of the document, followed by a table | A | B |\"\n# }\n
                                                                                                                                                                "},{"location":"pipes/aggregators/simple-aggregator/#edspdf.pipes.aggregators.simple.SimpleAggregator--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                The pipeline object

                                                                                                                                                                TYPE: Pipeline DEFAULT: None

                                                                                                                                                                name

                                                                                                                                                                The name of the component

                                                                                                                                                                TYPE: str DEFAULT: 'simple-aggregator'

                                                                                                                                                                sort

                                                                                                                                                                Whether to sort text boxes inside each label group by (page, y, x) position before merging them.

                                                                                                                                                                TYPE: bool DEFAULT: False

                                                                                                                                                                new_line_threshold

                                                                                                                                                                Minimum ratio of the distance between two lines to the median height of lines to consider them as being on separate lines

                                                                                                                                                                TYPE: float DEFAULT: 0.2

                                                                                                                                                                new_paragraph_threshold

                                                                                                                                                                Minimum ratio of the distance between two lines to the median height of lines to consider them as being on separate paragraphs and thus add a newline character between them.

                                                                                                                                                                TYPE: float DEFAULT: 1.5

                                                                                                                                                                label_map

                                                                                                                                                                A dictionary mapping labels to new labels. This is useful to group labels together, for instance, to output both \"body\" and \"table\" as \"text\".

                                                                                                                                                                TYPE: Dict DEFAULT: {}

                                                                                                                                                                Source code in edspdf/pipes/aggregators/simple.py
                                                                                                                                                                def __init__(\n    self,\n    pipeline: Pipeline = None,\n    name: str = \"simple-aggregator\",\n    sort: bool = False,\n    new_line_threshold: float = 0.2,\n    new_paragraph_threshold: float = 1.5,\n    label_map: Dict = {},\n) -> None:\n    self.name = name\n    self.sort = sort\n    self.label_map = dict(label_map)\n    self.new_line_threshold = new_line_threshold\n    self.new_paragraph_threshold = new_paragraph_threshold\n
                                                                                                                                                                "},{"location":"pipes/box-classifiers/","title":"Box classifiers","text":"

                                                                                                                                                                We developed EDS-PDF with modularity in mind. To that end, you can choose between multiple classification methods.

                                                                                                                                                                Factory name Description mask-classifier Simple rule-based classification multi-mask-classifier Simple rule-based classification dummy-classifier Dummy classifier, for testing purposes. random-classifier To sow chaos trainable-classifier Trainable box classification model"},{"location":"pipes/box-classifiers/dummy/","title":"Dummy classifier","text":"

                                                                                                                                                                Dummy classifier, for testing purposes. Classifies every line with the same given label.

                                                                                                                                                                "},{"location":"pipes/box-classifiers/dummy/#edspdf.pipes.classifiers.dummy.DummyClassifier--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                The pipeline object.

                                                                                                                                                                TYPE: Pipeline DEFAULT: None

                                                                                                                                                                name

                                                                                                                                                                The name of the component.

                                                                                                                                                                TYPE: str DEFAULT: 'dummy-classifier'

                                                                                                                                                                label

                                                                                                                                                                The label to assign to each line.

                                                                                                                                                                TYPE: str

                                                                                                                                                                "},{"location":"pipes/box-classifiers/mask/","title":"Mask Classification","text":"

                                                                                                                                                                We developed a simple classifier that roughly uses the same strategy as PDFBox, namely:

                                                                                                                                                                • define a \"mask\" on the PDF documents;
                                                                                                                                                                • keep every text block within that mask, tag everything else as pollution.
                                                                                                                                                                "},{"location":"pipes/box-classifiers/mask/#factories","title":"Factories","text":"

                                                                                                                                                                Two factories are available in the classifiers registry: mask-classifier and multi-mask-classifier.

                                                                                                                                                                "},{"location":"pipes/box-classifiers/mask/#edspdf.pipes.classifiers.mask.simple_mask_classifier_factory","title":"mask-classifier","text":"

                                                                                                                                                                The simplest form of mask classification. You define the mask, everything else is tagged as pollution.

                                                                                                                                                                PARAMETER DESCRIPTION pipeline

                                                                                                                                                                The pipeline object

                                                                                                                                                                TYPE: Pipeline DEFAULT: None

                                                                                                                                                                name

                                                                                                                                                                The name of the component

                                                                                                                                                                TYPE: str DEFAULT: 'mask-classifier'

                                                                                                                                                                x0

                                                                                                                                                                The x0 coordinate of the mask

                                                                                                                                                                TYPE: float

                                                                                                                                                                y0

                                                                                                                                                                The y0 coordinate of the mask

                                                                                                                                                                TYPE: float

                                                                                                                                                                x1

                                                                                                                                                                The x1 coordinate of the mask

                                                                                                                                                                TYPE: float

                                                                                                                                                                y1

                                                                                                                                                                The y1 coordinate of the mask

                                                                                                                                                                TYPE: float

                                                                                                                                                                threshold

                                                                                                                                                                The threshold for the alignment

                                                                                                                                                                TYPE: float DEFAULT: 1.0

                                                                                                                                                                "},{"location":"pipes/box-classifiers/mask/#edspdf.pipes.classifiers.mask.simple_mask_classifier_factory--examples","title":"Examples","text":"API-basedConfiguration-based
                                                                                                                                                                pipeline.add_pipe(\n    \"mask-classifier\",\n    name=\"classifier\",\n    config={\n        \"threshold\": 0.9,\n        \"x0\": 0.1,\n        \"y0\": 0.1,\n        \"x1\": 0.9,\n        \"y1\": 0.9,\n    },\n)\n
                                                                                                                                                                [components.classifier]\n@classifiers = \"mask-classifier\"\nx0 = 0.1\ny0 = 0.1\nx1 = 0.9\ny1 = 0.9\nthreshold = 0.9\n
                                                                                                                                                                "},{"location":"pipes/box-classifiers/mask/#edspdf.pipes.classifiers.mask.mask_classifier_factory","title":"multi-mask-classifier","text":"

                                                                                                                                                                A generalisation, wherein the user defines a number of regions.

                                                                                                                                                                The following configuration produces exactly the same classifier as the mask-classifier example above.

                                                                                                                                                                Any block that is not part of a mask is tagged as pollution.

                                                                                                                                                                PARAMETER DESCRIPTION pipeline

                                                                                                                                                                The pipeline object

                                                                                                                                                                TYPE: Pipeline DEFAULT: None

                                                                                                                                                                name

                                                                                                                                                                TYPE: str DEFAULT: 'multi-mask-classifier'

                                                                                                                                                                threshold

                                                                                                                                                                The threshold for the alignment

                                                                                                                                                                TYPE: float DEFAULT: 1.0

                                                                                                                                                                masks

                                                                                                                                                                The masks

                                                                                                                                                                TYPE: Box DEFAULT: {}

                                                                                                                                                                "},{"location":"pipes/box-classifiers/mask/#edspdf.pipes.classifiers.mask.mask_classifier_factory--examples","title":"Examples","text":"API-basedConfiguration-based
                                                                                                                                                                pipeline.add_pipe(\n    \"multi-mask-classifier\",\n    name=\"classifier\",\n    config={\n        \"threshold\": 0.9,\n        \"mymask\": {\"x0\": 0.1, \"y0\": 0.1, \"x1\": 0.9, \"y1\": 0.3, \"label\": \"body\"},\n    },\n)\n
                                                                                                                                                                [components.classifier]\n@factory = \"multi-mask-classifier\"\nthreshold = 0.9\n\n[components.classifier.mymask]\nlabel = \"body\"\nx0 = 0.1\ny0 = 0.1\nx1 = 0.9\ny1 = 0.9\n

                                                                                                                                                                The following configuration defines a header region.

                                                                                                                                                                API-basedConfiguration-based
                                                                                                                                                                pipeline.add_pipe(\n    \"multi-mask-classifier\",\n    name=\"classifier\",\n    config={\n        \"threshold\": 0.9,\n        \"header\": {\"x0\": 0.1, \"y0\": 0.1, \"x1\": 0.9, \"y1\": 0.3, \"label\": \"header\"},\n        \"body\": {\"x0\": 0.1, \"y0\": 0.3, \"x1\": 0.9, \"y1\": 0.9, \"label\": \"body\"},\n    },\n)\n
                                                                                                                                                                [components.classifier]\n@factory = \"multi-mask-classifier\"\nthreshold = 0.9\n\n[components.classifier.header]\nlabel = \"header\"\nx0 = 0.1\ny0 = 0.1\nx1 = 0.9\ny1 = 0.3\n\n[components.classifier.body]\nlabel = \"body\"\nx0 = 0.1\ny0 = 0.3\nx1 = 0.9\ny1 = 0.9\n
                                                                                                                                                                "},{"location":"pipes/box-classifiers/random/","title":"Random classifier","text":"

                                                                                                                                                                Random classifier, for chaos purposes. Classifies each box to a random element.

                                                                                                                                                                "},{"location":"pipes/box-classifiers/random/#edspdf.pipes.classifiers.random.RandomClassifier--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                The pipeline object.

                                                                                                                                                                TYPE: Pipeline

                                                                                                                                                                name

                                                                                                                                                                The name of the component.

                                                                                                                                                                TYPE: str DEFAULT: 'random-classifier'

                                                                                                                                                                labels

                                                                                                                                                                The labels to assign to each line. If a list is passed, each label is assigned with equal probability. If a dict is passed, the keys are the labels and the values are the probabilities.

                                                                                                                                                                TYPE: Union[List[str], Dict[str, float]]

                                                                                                                                                                "},{"location":"pipes/box-classifiers/trainable/","title":"Trainable classifier","text":"

                                                                                                                                                                This component predicts a label for each box over the whole document using machine learning.

                                                                                                                                                                Note

                                                                                                                                                                You must train your model to use this classifier. See Model training for more information.

                                                                                                                                                                "},{"location":"pipes/box-classifiers/trainable/#edspdf.pipes.classifiers.trainable.TrainableClassifier--examples","title":"Examples","text":"

                                                                                                                                                                The classifier is composed of the following blocks:

                                                                                                                                                                • a configurable box embedding layer
                                                                                                                                                                • a linear classification layer

                                                                                                                                                                In this example, we use a box-embedding layer to generate the embeddings of the boxes. It is composed of a text encoder that embeds the text features of the boxes and a layout encoder that embeds the layout features of the boxes. These two embeddings are summed and passed through an optional contextualizer, here a box-transformer.

                                                                                                                                                                API-basedConfiguration-based
                                                                                                                                                                pipeline.add_pipe(\n    \"trainable-classifier\",\n    name=\"classifier\",\n    config={\n        # simple embedding computed by pooling embeddings of words in each box\n        \"embedding\": {\n            \"@factory\": \"sub-box-cnn-pooler\",\n            \"out_channels\": 64,\n            \"kernel_sizes\": (3, 4, 5),\n            \"embedding\": {\n                \"@factory\": \"simple-text-embedding\",\n                \"size\": 72,\n            },\n        },\n        \"labels\": [\"body\", \"pollution\"],\n    },\n)\n
                                                                                                                                                                [components.classifier]\n@factory = \"trainable-classifier\"\nlabels = [\"body\", \"pollution\"]\n\n[components.classifier.embedding]\n@factory = \"sub-box-cnn-pooler\"\nout_channels = 64\nkernel_sizes = (3, 4, 5)\n\n[components.classifier.embedding.embedding]\n@factory = \"simple-text-embedding\"\nsize = 72\n
                                                                                                                                                                "},{"location":"pipes/box-classifiers/trainable/#edspdf.pipes.classifiers.trainable.TrainableClassifier--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION labels

                                                                                                                                                                Initial labels of the classifier (will be completed during initialization)

                                                                                                                                                                TYPE: Sequence[str] DEFAULT: ('pollution')

                                                                                                                                                                embedding

                                                                                                                                                                Embedding module to encode the PDF boxes

                                                                                                                                                                TYPE: TrainablePipe[EmbeddingOutput]

                                                                                                                                                                "},{"location":"pipes/embeddings/","title":"Embeddings","text":"

                                                                                                                                                                We offer multiple embedding methods to encode the text and layout information of the PDFs. The following components can be added to a pipeline or composed together, and contain preprocessing and postprocessing logic to convert and batch documents.

                                                                                                                                                                Factory name Description simple-text-embedding A module that embeds the textual features of the blocks. embedding-combiner Encodes boxes using a combination of multiple encoders sub-box-cnn-pooler Pools the output of a CNN over the elements of a box (like words) box-layout-embedding Encodes the layout of the boxes box-transformer Contextualizes box representations using a transformer huggingface-embedding Box representations using a Huggingface multi-modal model.

                                                                                                                                                                Layers

                                                                                                                                                                These components are not to be confused with layers, which are standard PyTorch modules that can be used to build trainable components, such as the ones described here.

                                                                                                                                                                "},{"location":"pipes/embeddings/box-layout-embedding/","title":"BoxLayoutEmbedding","text":"

                                                                                                                                                                This component encodes the geometrical features of a box, as extracted by the BoxLayoutPreprocessor module, into an embedding. For position modes, use:

                                                                                                                                                                • \"sin\" to embed positions with a fixed SinusoidalEmbedding
                                                                                                                                                                • \"learned\" to embed positions using a learned standard pytorch embedding layer

                                                                                                                                                                Each produces embedding is the concatenation of the box width, height and the top, left, bottom and right coordinates, each embedded depending on the *_mode param.

                                                                                                                                                                "},{"location":"pipes/embeddings/box-layout-embedding/#edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION size

                                                                                                                                                                Size of the output box embedding

                                                                                                                                                                TYPE: int

                                                                                                                                                                n_positions

                                                                                                                                                                Number of position embeddings stored in the PositionEmbedding module

                                                                                                                                                                TYPE: int

                                                                                                                                                                x_mode

                                                                                                                                                                Position embedding mode of the x coordinates

                                                                                                                                                                TYPE: Literal['sin', 'learned'] DEFAULT: 'sin'

                                                                                                                                                                y_mode

                                                                                                                                                                Position embedding mode of the y coordinates

                                                                                                                                                                TYPE: Literal['sin', 'learned'] DEFAULT: 'sin'

                                                                                                                                                                w_mode

                                                                                                                                                                Position embedding mode of the width features

                                                                                                                                                                TYPE: Literal['sin', 'learned'] DEFAULT: 'sin'

                                                                                                                                                                h_mode

                                                                                                                                                                Position embedding mode of the height features

                                                                                                                                                                TYPE: Literal['sin', 'learned'] DEFAULT: 'sin'

                                                                                                                                                                "},{"location":"pipes/embeddings/box-transformer/","title":"BoxTransformer","text":"

                                                                                                                                                                BoxTransformer using BoxTransformerModule under the hood.

                                                                                                                                                                Note

                                                                                                                                                                This module is a TrainablePipe and can be used in a Pipeline, while BoxTransformerModule is a standard PyTorch module, which does not take care of the preprocessing, collating, etc. of the input documents.

                                                                                                                                                                "},{"location":"pipes/embeddings/box-transformer/#edspdf.pipes.embeddings.box_transformer.BoxTransformer--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                Pipeline instance

                                                                                                                                                                TYPE: Pipeline DEFAULT: None

                                                                                                                                                                name

                                                                                                                                                                Name of the component

                                                                                                                                                                TYPE: str DEFAULT: 'box-transformer'

                                                                                                                                                                num_heads

                                                                                                                                                                Number of attention heads in the attention layers

                                                                                                                                                                TYPE: int DEFAULT: 2

                                                                                                                                                                n_relative_positions

                                                                                                                                                                Maximum range of embeddable relative positions between boxes (further distances are capped to \u00b1n_relative_positions // 2)

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                dropout_p

                                                                                                                                                                Dropout probability both for the attention layers and embedding projections

                                                                                                                                                                TYPE: float DEFAULT: 0.0

                                                                                                                                                                head_size

                                                                                                                                                                Head sizes of the attention layers

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                activation

                                                                                                                                                                Activation function used in the linear->activation->linear transformations

                                                                                                                                                                TYPE: ActivationFunction DEFAULT: 'gelu'

                                                                                                                                                                init_resweight

                                                                                                                                                                Initial weight of the residual gates. At 0, the layer acts (initially) as an identity function, and at 1 as a standard Transformer layer. Initializing with a value close to 0 can help the training converge.

                                                                                                                                                                TYPE: float DEFAULT: 0.0

                                                                                                                                                                attention_mode

                                                                                                                                                                Mode of relative position infused attention layer. See the relative attention documentation for more information.

                                                                                                                                                                TYPE: Sequence[Literal['c2c', 'c2p', 'p2c']] DEFAULT: ('c2c', 'c2p', 'p2c')

                                                                                                                                                                n_layers

                                                                                                                                                                Number of layers in the Transformer

                                                                                                                                                                TYPE: int DEFAULT: 2

                                                                                                                                                                "},{"location":"pipes/embeddings/embedding-combiner/","title":"EmbeddingCombiner","text":"

                                                                                                                                                                Encodes boxes using a combination of multiple encoders

                                                                                                                                                                "},{"location":"pipes/embeddings/embedding-combiner/#edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                The pipeline object

                                                                                                                                                                TYPE: Pipeline DEFAULT: None

                                                                                                                                                                name

                                                                                                                                                                The name of the pipe

                                                                                                                                                                TYPE: str DEFAULT: 'embedding-combiner'

                                                                                                                                                                mode

                                                                                                                                                                The mode to use to combine the encoders:

                                                                                                                                                                • sum: Sum the outputs of the encoders
                                                                                                                                                                • cat: Concatenate the outputs of the encoders

                                                                                                                                                                TYPE: Literal['sum', 'cat'] DEFAULT: 'sum'

                                                                                                                                                                dropout_p

                                                                                                                                                                Dropout probability used on the output of the box and textual encoders

                                                                                                                                                                TYPE: float DEFAULT: 0.0

                                                                                                                                                                encoders

                                                                                                                                                                The encoders to use. The keys are the names of the encoders and the values are the encoders themselves.

                                                                                                                                                                TYPE: TrainablePipe[EmbeddingOutput] DEFAULT: {}

                                                                                                                                                                "},{"location":"pipes/embeddings/huggingface-embedding/","title":"HuggingfaceEmbedding","text":"

                                                                                                                                                                The HuggingfaceEmbedding component is a wrapper around the Huggingface multi-modal models. Such pre-trained models should offer better results than a model trained from scratch. Compared to using the raw Huggingface model, we offer a simple mechanism to split long documents into strided windows before feeding them to the model.

                                                                                                                                                                "},{"location":"pipes/embeddings/huggingface-embedding/#edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding--windowing","title":"Windowing","text":"

                                                                                                                                                                The HuggingfaceEmbedding component splits long documents into smaller windows before feeding them to the model. This is done to avoid hitting the maximum number of tokens that can be processed by the model on a single device. The window size and stride can be configured using the window and stride parameters. The default values are 510 and 255 respectively, which means that the model will process windows of 510 tokens, each separated by 255 tokens. Whenever a token appears in multiple windows, the embedding of the \"most contextualized\" occurrence is used, i.e. the occurrence that is the closest to the center of its window.

                                                                                                                                                                Here is an overview of how this works in a classifier model:

                                                                                                                                                                "},{"location":"pipes/embeddings/huggingface-embedding/#edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding--examples","title":"Examples","text":"

                                                                                                                                                                Here is an example of how to define a pipeline with the HuggingfaceEmbedding component:

                                                                                                                                                                from edspdf import Pipeline\n\nmodel = Pipeline()\nmodel.add_pipe(\n    \"pdfminer-extractor\",\n    name=\"extractor\",\n    config={\n        \"render_pages\": True,\n    },\n)\nmodel.add_pipe(\n    \"huggingface-embedding\",\n    name=\"embedding\",\n    config={\n        \"model\": \"microsoft/layoutlmv3-base\",\n        \"use_image\": False,\n        \"window\": 128,\n        \"stride\": 64,\n        \"line_pooling\": \"mean\",\n    },\n)\nmodel.add_pipe(\n    \"trainable-classifier\",\n    name=\"classifier\",\n    config={\n        \"embedding\": model.get_pipe(\"embedding\"),\n        \"labels\": [],\n    },\n)\n

                                                                                                                                                                This model can then be trained following the training recipe.

                                                                                                                                                                "},{"location":"pipes/embeddings/huggingface-embedding/#edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                The pipeline instance

                                                                                                                                                                TYPE: Pipeline DEFAULT: None

                                                                                                                                                                name

                                                                                                                                                                The component name

                                                                                                                                                                TYPE: str DEFAULT: 'huggingface-embedding'

                                                                                                                                                                model

                                                                                                                                                                The Huggingface model name or path

                                                                                                                                                                TYPE: str DEFAULT: None

                                                                                                                                                                use_image

                                                                                                                                                                Whether to use the image or not in the model

                                                                                                                                                                TYPE: bool DEFAULT: True

                                                                                                                                                                window

                                                                                                                                                                The window size to use when splitting long documents into smaller windows before feeding them to the Transformer model (default: 510 = 512 - 2)

                                                                                                                                                                TYPE: int DEFAULT: 510

                                                                                                                                                                stride

                                                                                                                                                                The stride (distance between windows) to use when splitting long documents into smaller windows (default: 510 / 2 = 255)

                                                                                                                                                                TYPE: int DEFAULT: 255

                                                                                                                                                                line_pooling

                                                                                                                                                                The pooling strategy to use when combining the embeddings of the tokens in a line into a single line embedding

                                                                                                                                                                TYPE: Literal['mean', 'max', 'sum'] DEFAULT: 'mean'

                                                                                                                                                                max_tokens_per_device

                                                                                                                                                                The maximum number of tokens that can be processed by the model on a single device. This does not affect the results but can be used to reduce the memory usage of the model, at the cost of a longer processing time.

                                                                                                                                                                TYPE: int DEFAULT: 128 * 128

                                                                                                                                                                "},{"location":"pipes/embeddings/simple-text-embedding/","title":"SimpleTextEmbedding","text":"

                                                                                                                                                                A module that embeds the textual features of the blocks

                                                                                                                                                                "},{"location":"pipes/embeddings/simple-text-embedding/#edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION size

                                                                                                                                                                Size of the output box embedding

                                                                                                                                                                TYPE: int

                                                                                                                                                                pipeline

                                                                                                                                                                The pipeline object

                                                                                                                                                                TYPE: Pipeline DEFAULT: None

                                                                                                                                                                name

                                                                                                                                                                Name of the component

                                                                                                                                                                TYPE: str DEFAULT: 'simple-text-embedding'

                                                                                                                                                                "},{"location":"pipes/embeddings/sub-box-cnn-pooler/","title":"SubBoxCNNPooler","text":"

                                                                                                                                                                One-dimensional CNN encoding multi-kernel layer. Input embeddings are convolved using linear kernels, each parametrized with a (window) size of kernel_sizes[kernel_i]. The outputs of the kernels are concatenated together, max-pooled and finally projected to a size of output_size.

                                                                                                                                                                "},{"location":"pipes/embeddings/sub-box-cnn-pooler/#edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                Pipeline instance

                                                                                                                                                                TYPE: Pipeline DEFAULT: None

                                                                                                                                                                name

                                                                                                                                                                Name of the component

                                                                                                                                                                TYPE: str DEFAULT: 'sub-box-cnn-pooler'

                                                                                                                                                                output_size

                                                                                                                                                                Size of the output embeddings. Defaults to the input_size.

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                out_channels

                                                                                                                                                                Number of channels

                                                                                                                                                                TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                kernel_sizes

                                                                                                                                                                Window size of each kernel

                                                                                                                                                                TYPE: Sequence[int] DEFAULT: (3, 4, 5)

                                                                                                                                                                activation

                                                                                                                                                                Activation function to use

                                                                                                                                                                TYPE: ActivationFunction DEFAULT: 'relu'

                                                                                                                                                                "},{"location":"pipes/extractors/","title":"Extraction","text":"

                                                                                                                                                                The extraction phase consists of reading the PDF document and gather text blocs, along with their dimensions and position within the document. Said blocs will go on to the classification phase to separate the body from the rest.

                                                                                                                                                                "},{"location":"pipes/extractors/#text-based-pdf","title":"Text-based PDF","text":"

                                                                                                                                                                We provide multiple extractor architectures for text-based PDFs:

                                                                                                                                                                Factory name Description pdfminer-extractor Extracts text lines with the pdfminer library mupdf-extractor Extracts text lines with the pymupdf library poppler-extractor Extracts text lines with the poppler library"},{"location":"pipes/extractors/#image-based-pdf","title":"Image-based PDF","text":"

                                                                                                                                                                Image-based PDF documents require an OCR1 step, which is not natively supported by EDS-PDF. However, you can easily extend EDS-PDF by adding such a method to the registry.

                                                                                                                                                                We plan on adding such an OCR extractor component in the future.

                                                                                                                                                                1. Optical Character Recognition, or OCR, is the process of extracting characters and words from an image.\u00a0\u21a9

                                                                                                                                                                  "},{"location":"pipes/extractors/pdfminer/","title":"PdfMiner Extractor","text":"

                                                                                                                                                                  We provide a PDF line extractor built on top of PdfMiner.

                                                                                                                                                                  This is the most portable extractor, since it is pure-python and can therefore be run on any platform. Be sure to have a look at their documentation, especially the part providing a bird's eye view of the PDF extraction process.

                                                                                                                                                                  "},{"location":"pipes/extractors/pdfminer/#edspdf.pipes.extractors.pdfminer.PdfMinerExtractor--examples","title":"Examples","text":"API-basedConfiguration-based
                                                                                                                                                                  pipeline.add_pipe(\n    \"pdfminer-extractor\",\n    config=dict(\n        extract_style=False,\n    ),\n)\n
                                                                                                                                                                  [components.extractor]\n@factory = \"pdfminer-extractor\"\nextract_style = false\n

                                                                                                                                                                  And use the pipeline on a PDF document:

                                                                                                                                                                  from pathlib import Path\n\n# Apply on a new document\npipeline(Path(\"path/to/your/pdf/document\").read_bytes())\n
                                                                                                                                                                  "},{"location":"pipes/extractors/pdfminer/#edspdf.pipes.extractors.pdfminer.PdfMinerExtractor--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION line_overlap

                                                                                                                                                                  See PDFMiner documentation

                                                                                                                                                                  TYPE: float DEFAULT: 0.5

                                                                                                                                                                  char_margin

                                                                                                                                                                  See PDFMiner documentation

                                                                                                                                                                  TYPE: float DEFAULT: 2.05

                                                                                                                                                                  line_margin

                                                                                                                                                                  See PDFMiner documentation

                                                                                                                                                                  TYPE: float DEFAULT: 0.5

                                                                                                                                                                  word_margin

                                                                                                                                                                  See PDFMiner documentation

                                                                                                                                                                  TYPE: float DEFAULT: 0.1

                                                                                                                                                                  boxes_flow

                                                                                                                                                                  See PDFMiner documentation

                                                                                                                                                                  TYPE: Optional[float] DEFAULT: 0.5

                                                                                                                                                                  detect_vertical

                                                                                                                                                                  See PDFMiner documentation

                                                                                                                                                                  TYPE: bool DEFAULT: False

                                                                                                                                                                  all_texts

                                                                                                                                                                  See PDFMiner documentation

                                                                                                                                                                  TYPE: bool DEFAULT: False

                                                                                                                                                                  extract_style

                                                                                                                                                                  Whether to extract style (font, size, ...) information for each line of the document. Default: False

                                                                                                                                                                  TYPE: bool DEFAULT: False

                                                                                                                                                                  render_pages

                                                                                                                                                                  Whether to extract the rendered page as a numpy array in the page.image attribute (defaults to False)

                                                                                                                                                                  TYPE: bool DEFAULT: False

                                                                                                                                                                  render_dpi

                                                                                                                                                                  DPI to use when rendering the page (defaults to 200)

                                                                                                                                                                  TYPE: int DEFAULT: 200

                                                                                                                                                                  raise_on_error

                                                                                                                                                                  Whether to raise an error if the PDF cannot be parsed. Default: False

                                                                                                                                                                  TYPE: bool DEFAULT: False

                                                                                                                                                                  "},{"location":"recipes/","title":"EDS-PDF Recipes","text":"

                                                                                                                                                                  This section goes over a few use-cases for PDF extraction. It is meant as a more hands-on tutorial to get a grip on the library.

                                                                                                                                                                  "},{"location":"recipes/annotation/","title":"PDF Annotation","text":"

                                                                                                                                                                  In this section, we will cover one methodology to annotate PDF documents.

                                                                                                                                                                  Data annotation at AP-HP's CDW

                                                                                                                                                                  At AP-HP's CDW1, we recently moved away from a rule- and Java-based PDF extraction pipeline (using PDFBox) to one using EDS-PDF. Hence, EDS-PDF is used in production, helping extract text from around 100k PDF documents every day.

                                                                                                                                                                  To train our pipeline presently in production, we annotated around 270 documents, and reached a f1-score of 0.98 on the body classification.

                                                                                                                                                                  "},{"location":"recipes/annotation/#preparing-the-data-for-annotation","title":"Preparing the data for annotation","text":"

                                                                                                                                                                  We will frame the annotation phase as an image segmentation task, where annotators are asked to draw bounding boxes around the different sections. Hence, the very first step is to convert PDF documents to images. We suggest using the library pdf2image for that step.

                                                                                                                                                                  The following script will convert the PDF documents located in a data/pdfs directory to PNG images inside the data/images folder.

                                                                                                                                                                  import pdf2image\nfrom pathlib import Path\n\nDATA_DIR = Path(\"data\")\nPDF_DIR = DATA_DIR / \"pdfs\"\nIMAGE_DIR = DATA_DIR / \"images\"\n\nfor pdf in PDF_DIR.glob(\"*.pdf\"):\n    imgs = pdf2image.convert_from_bytes(pdf)\n\n    for page, img in enumerate(imgs):\n        path = IMAGE_DIR / f\"{pdf.stem}_{page}.png\"\n        img.save(path)\n

                                                                                                                                                                  You can use any annotation tool to annotate the images. If you're looking for a simple way to annotate from within a Jupyter Notebook, ipyannotations might be a good fit.

                                                                                                                                                                  You will need to post-process the output to convert the annotations to the following format:

                                                                                                                                                                  Key Description page Page within the PDF (0-indexed) x0 Horizontal position of the top-left corner of the bounding box x1 Horizontal position of the bottom-right corner of the bounding box y0 Vertical position of the top-left corner of the bounding box y1 Vertical position of the bottom-right corner of the bounding box label Class of the bounding box (e.g. body, header...)

                                                                                                                                                                  All dimensions should be normalised by the height and width of the page.

                                                                                                                                                                  "},{"location":"recipes/annotation/#saving-the-dataset","title":"Saving the dataset","text":"

                                                                                                                                                                  Once the annotation phase is complete, make sure the train/test split is performed once and for all when you create the dataset.

                                                                                                                                                                  We suggest the following structure:

                                                                                                                                                                  Directory structure
                                                                                                                                                                  dataset/\n\u251c\u2500\u2500 train/\n\u2502   \u251c\u2500\u2500 <note_id_1>.pdf\n\u2502   \u251c\u2500\u2500 <note_id_1>.json\n\u2502   \u251c\u2500\u2500 <note_id_2>.pdf\n\u2502   \u251c\u2500\u2500 <note_id_2>.json\n\u2502   \u2514\u2500\u2500 ...\n\u2514\u2500\u2500 test/\n    \u251c\u2500\u2500 <note_id_n>.pdf\n    \u251c\u2500\u2500 <note_id_n>.json\n    \u2514\u2500\u2500 ...\n

                                                                                                                                                                  Where the normalised annotation resides in a JSON file living next to the related PDF, and uses the following schema:

                                                                                                                                                                  Key Description note_id Reference to the document <properties> Optional property of the document itself annotations List of annotations, following the schema above

                                                                                                                                                                  This structure presents the advantage of being machine- and human-friendly. The JSON file contains annotated regions as well as any document property that could be useful to adapt the pipeline (typically for the classification step).

                                                                                                                                                                  "},{"location":"recipes/annotation/#extracting-annotations","title":"Extracting annotations","text":"

                                                                                                                                                                  The following snippet extracts the annotations into a workable format:

                                                                                                                                                                  from pathlib import Path\nimport pandas as pd\n\n\ndef get_annotations(\n    directory: Path,\n) -> pd.DataFrame:\n\"\"\"\n    Read annotations from the dataset directory.\n\n    Parameters\n    ----------\n    directory : Path\n        Dataset directory\n\n    Returns\n    -------\n    pd.DataFrame\n        Pandas DataFrame containing the annotations.\n    \"\"\"\n    dfs = []\n\n    iterator = tqdm(list(directory.glob(\"*.json\")))\n\n    for path in iterator:\n        meta = json.loads(path.read_text())\n        df = pd.DataFrame.from_records(meta.pop(\"annotations\"))\n\n        for k, v in meta.items():  # (1)\n            df[k] = v\n\n        dfs.append(df)\n\n    return pd.concat(dfs)\n\n\ntrain_path = Path(\"dataset/train\")\n\nannotations = get_annotations(train_path)\n
                                                                                                                                                                  1. Add a column for each additional property saved in the dataset.

                                                                                                                                                                  The annotations compiled this way can be used to train a pipeline. See the trained pipeline recipe for more detail.

                                                                                                                                                                  1. Greater Paris University Hospital's Clinical Data Warehouse\u00a0\u21a9

                                                                                                                                                                    "},{"location":"recipes/extension/","title":"Extending EDS-PDF","text":"

                                                                                                                                                                    EDS-PDF is organised around a function registry powered by catalogue and a custom configuration system. The result is a powerful framework that is easy to extend - and we'll see how in this section.

                                                                                                                                                                    For this recipe, let's imagine we're not entirely satisfied with the aggregation proposed by EDS-PDF. For instance, we might want an aggregator that outputs the text in Markdown format.

                                                                                                                                                                    Note

                                                                                                                                                                    Properly converting to markdown is no easy task. For this example, we will limit ourselves to detecting bold and italics sections.

                                                                                                                                                                    "},{"location":"recipes/extension/#developing-the-new-aggregator","title":"Developing the new aggregator","text":"

                                                                                                                                                                    Our aggregator will inherit from the SimpleAggregator, and use the style to detect italics and bold sections.

                                                                                                                                                                    markdown_aggregator.py
                                                                                                                                                                    from edspdf import registry\nfrom edspdf.pipes.aggregators.simple import SimpleAggregator\nfrom edspdf.structures import PDFDoc, Text\n\n\n@registry.factory.register(\"markdown-aggregator\")  # (1)\nclass MarkdownAggregator(SimpleAggregator):\n    def __call__(self, doc: PDFDoc) -> PDFDoc:\n        doc = super().__call__(doc)\n\n        for label in doc.aggregated_texts.keys():\n            text = doc.aggregated_texts[label].text\n\n            fragments = []\n\n            offset = 0\n            for s in doc.aggregated_texts[label].properties:\n                if s.begin >= s.end:\n                    continue\n                if offset < s.begin:\n                    fragments.append(text[offset : s.begin])\n\n                offset = s.end\n                snippet = text[s.begin : s.end]\n                if s.bold:\n                    snippet = f\"**{snippet}**\"\n                if s.italic:\n                    snippet = f\"_{snippet}_\"\n                fragments.append(snippet)\n\n            if offset < len(text):\n                fragments.append(text[offset:])\n\n            doc.aggregated_texts[label] = Text(text=\"\".join(fragments))\n\n        return doc\n
                                                                                                                                                                    1. The new aggregator is registered via this line
                                                                                                                                                                    2. The new aggregator redefines the __call__ method. It will output a single string, corresponding to the markdown-formatted output.

                                                                                                                                                                    That's it! You can use this new aggregator with the API:

                                                                                                                                                                    from edspdf import Pipeline\nfrom markdown_aggregator import MarkdownAggregator  # (1)\n\nmodel = Pipeline()\n# will extract text lines from a document\nmodel.add_pipe(\n    \"pdfminer-extractor\",\n    config=dict(\n        extract_style=False,\n    ),\n)\n# classify everything inside the `body` bounding box as `body`\nmodel.add_pipe(\"mask-classifier\", config={\"x0\": 0.1, \"y0\": 0.1, \"x1\": 0.9, \"y1\": 0.9})\n# aggregates the lines together to generate the markdown formatted text\nmodel.add_pipe(\"markdown-aggregator\")\n
                                                                                                                                                                    1. We're importing the aggregator that we just defined.

                                                                                                                                                                    It all works relatively smoothly!

                                                                                                                                                                    "},{"location":"recipes/extension/#making-the-aggregator-discoverable","title":"Making the aggregator discoverable","text":"

                                                                                                                                                                    Now, how can we instantiate the pipeline using the configuration system? The registry needs to be aware of the new function, but we shouldn't have to import mardown_aggregator.py just so that the module is registered as a side-effect...

                                                                                                                                                                    Catalogue solves this problem by using Python entry points.

                                                                                                                                                                    pyproject.tomlsetup.py
                                                                                                                                                                    [project.entry-points.\"edspdf_factories\"]\n\"markdown-aggregator\" = \"markdown_aggregator:MarkdownAggregator\"\n
                                                                                                                                                                    from setuptools import setup\n\nsetup(\n    name=\"edspdf-markdown-aggregator\",\n    entry_points={\n        \"edspdf_factories\": [\n            \"markdown-aggregator = markdown_aggregator:MarkdownAggregator\"\n        ]\n    },\n)\n

                                                                                                                                                                    By declaring the new aggregator as an entrypoint, it will become discoverable by EDS-PDF as long as it is installed in your environment!

                                                                                                                                                                    "},{"location":"recipes/rule-based/","title":"Rule-based extraction","text":"

                                                                                                                                                                    Let's create a rule-based extractor for PDF documents.

                                                                                                                                                                    Note

                                                                                                                                                                    This pipeline will likely perform poorly as soon as your PDF documents come in varied forms. In that case, even a very simple trained pipeline may give you a substantial performance boost (see next section).

                                                                                                                                                                    First, download this example PDF.

                                                                                                                                                                    We will use the following configuration:

                                                                                                                                                                    config.cfg
                                                                                                                                                                    [pipeline]\ncomponents = [\"extractor\", \"classifier\", \"aggregator\"]\ncomponents_config = ${components}\n\n[components.extractor]\n@factory = \"pdfminer-extractor\"  # (2)\nextract_style = true\n\n[components.classifier]\n@factory = \"mask-classifier\"  # (3)\nx0 = 0.2\nx1 = 0.9\ny0 = 0.3\ny1 = 0.6\nthreshold = 0.1\n\n[components.aggregator]\n@factory = \"styled-aggregator\"  # (4)\n
                                                                                                                                                                    1. This is the top-level object, which organises the entire extraction process.
                                                                                                                                                                    2. Here we use the provided text-based extractor, based on the PDFMiner library
                                                                                                                                                                    3. This is where we define the rule-based classifier. Here, we use a \"mask\", meaning that every text bloc that falls within the boundaries will be assigned the body label, everything else will be tagged as pollution.
                                                                                                                                                                    4. This aggregator returns a tuple of dictionaries. The first contains compiled text for each label, the second exports their style.

                                                                                                                                                                    Save the configuration as config.cfg and run the following snippet:

                                                                                                                                                                    import edspdf\nimport pandas as pd\nfrom pathlib import Path\n\nmodel = edspdf.load(\"config.cfg\")  # (1)\n\n# Get a PDF\npdf = Path(\"/Users/perceval/Development/edspdf/tests/resources/letter.pdf\").read_bytes()\npdf = model(pdf)\n\nbody = pdf.aggregated_texts[\"body\"]\n\ntext, style = body.text, body.properties\nprint(text)\nprint(pd.DataFrame(style))\n

                                                                                                                                                                    This code will output the following results:

                                                                                                                                                                    VisualisationExtracted TextExtracted Style

                                                                                                                                                                    Cher Pr ABC, Cher DEF,\n\nNous souhaitons remercier le CSE pour son avis favorable quant \u00e0 l\u2019acc\u00e8s aux donn\u00e9es de\nl\u2019Entrep\u00f4t de Donn\u00e9es de Sant\u00e9 du projet n\u00b0 XXXX.\n\nNous avons bien pris connaissance des conditions requises pour cet avis favorable, c\u2019est\npourquoi nous nous engageons par la pr\u00e9sente \u00e0 :\n\n\u2022 Informer individuellement les patients concern\u00e9s par la recherche, admis \u00e0 l'AP-HP\navant juillet 2017, sortis vivants, et non r\u00e9admis depuis.\n\n\u2022 Effectuer une demande d'autorisation \u00e0 la CNIL en cas d'appariement avec d\u2019autres\ncohortes.\n\nBien cordialement,\n

                                                                                                                                                                    The start and end columns refer to the character indices within the extracted text.

                                                                                                                                                                    italic bold fontname start end False False BCDFEE+Calibri 0 22 False False BCDFEE+Calibri 24 90 False False BCDHEE+Calibri 90 91 False False BCDFEE+Calibri 91 111 False False BCDFEE+Calibri 112 113 False False BCDHEE+Calibri 113 114 False False BCDFEE+Calibri 114 161 False False BCDFEE+Calibri 163 247 False False BCDHEE+Calibri 247 248 False False BCDFEE+Calibri 248 251 False False BCDFEE+Calibri 252 300 False False SymbolMT 302 303 False False BCDFEE+Calibri 304 386 False False BCDFEE+Calibri 387 445 False False SymbolMT 447 448 False False BCDFEE+Calibri 449 523 False False BCDHEE+Calibri 523 524 False False BCDFEE+Calibri 524 530 False False BCDFEE+Calibri 531 540 False False BCDFEE+Calibri 542 560
                                                                                                                                                                      "},{"location":"recipes/training/","title":"Training a Pipeline","text":"

                                                                                                                                                                      In this chapter, we'll see how we can train a deep-learning based classifier to better classify the lines of the document and extract texts from the document.

                                                                                                                                                                      "},{"location":"recipes/training/#step-by-step-walkthrough","title":"Step-by-step walkthrough","text":"

                                                                                                                                                                      Training supervised models consists in feeding batches of samples taken from a training corpus to a model instantiated from a given architecture and optimizing the learnable weights of the model to decrease a given loss. The process of training a pipeline with EDS-PDF is as follows:

                                                                                                                                                                      1. We first start by seeding the random states and instantiating a new trainable pipeline. Here we show two example pipelines: the first one is based on a custom embedding architecture and the second one on a pre-trained HuggingFace transformer model.

                                                                                                                                                                        Custom architecturePre-trained HuggingFace transformer

                                                                                                                                                                        The architecture of the trainable classifier of this recipe is described in the following figure:

                                                                                                                                                                        from edspdf import Pipeline\nfrom edspdf.utils.random import set_seed\n\nset_seed(42)\n\nmodel = Pipeline()\nmodel.add_pipe(\"pdfminer-extractor\", name=\"extractor\") # (1)\nmodel.add_pipe(\n    \"box-transformer\",\n    name=\"embedding\",\n    config={\n        \"num_heads\": 4,\n        \"dropout_p\": 0.1,\n        \"activation\": \"gelu\",\n        \"init_resweight\": 0.01,\n        \"head_size\": 16,\n        \"attention_mode\": [\"c2c\", \"c2p\", \"p2c\"],\n        \"n_layers\": 1,\n        \"n_relative_positions\": 64,\n        \"embedding\": {\n            \"@factory\": \"embedding-combiner\",\n            \"dropout_p\": 0.1,\n            \"text_encoder\": {\n                \"@factory\": \"sub-box-cnn-pooler\",\n                \"out_channels\": 64,\n                \"kernel_sizes\": (3, 4, 5),\n                \"embedding\": {\n                    \"@factory\": \"simple-text-embedding\",\n                    \"size\": 72,\n                },\n            },\n            \"layout_encoder\": {\n                \"@factory\": \"box-layout-embedding\",\n                \"n_positions\": 64,\n                \"x_mode\": \"learned\",\n                \"y_mode\": \"learned\",\n                \"w_mode\": \"learned\",\n                \"h_mode\": \"learned\",\n                \"size\": 72,\n            },\n        },\n    },\n)\nmodel.add_pipe(\n    \"trainable-classifier\",\n    name=\"classifier\",\n    config={\n        \"embedding\": model.get_pipe(\"embedding\"),\n        \"labels\": [],\n    },\n)\n
                                                                                                                                                                        1. You can choose between multiple extractors, such as \"pdfminer-extractor\", \"mupdf-extractor\" or \"poppler-extractor\" (the latter does not support rendering images). See the extractors list here extractors for more details.
                                                                                                                                                                        model = Pipeline()\nmodel.add_pipe(\n    \"mupdf-extractor\",\n    name=\"extractor\",\n    config={\n        \"render_pages\": True,\n    },\n) # (1)\nmodel.add_pipe(\n    \"huggingface-embedding\",\n    name=\"embedding\",\n    config={\n        \"model\": \"microsoft/layoutlmv3-base\",\n        \"use_image\": False,\n        \"window\": 128,\n        \"stride\": 64,\n        \"line_pooling\": \"mean\",\n    },\n)\nmodel.add_pipe(\n    \"trainable-classifier\",\n    name=\"classifier\",\n    config={\n        \"embedding\": model.get_pipe(\"embedding\"),\n        \"labels\": [],\n    },\n)\n
                                                                                                                                                                        1. You can choose between multiple extractors, such as \"pdfminer-extractor\", \"mupdf-extractor\" or \"poppler-extractor\" (the latter does not support rendering images). See the extractors list here extractors for more details.
                                                                                                                                                                      2. We then load and adapt (i.e., convert into PDFDoc) the training and validation dataset, which is often a combination of JSON and PDF files. The recommended way of doing this is to make a Python generator of PDFDoc objects.

                                                                                                                                                                        train_docs = list(segmentation_adapter(train_path)(model))\nval_docs = list(segmentation_adapter(val_path)(model))\n

                                                                                                                                                                      3. We initialize the missing or incomplete components attributes (such as vocabularies) with the training dataset

                                                                                                                                                                        model.post_init(train_docs)\n

                                                                                                                                                                      4. The training dataset is then preprocessed into features. The resulting preprocessed dataset is then wrapped into a pytorch DataLoader to be fed to the model during the training loop with the model's own collate method.

                                                                                                                                                                        preprocessed = list(model.preprocess_many(train_docs, supervision=True))\ndataloader = DataLoader(\n    preprocessed,\n    batch_size=batch_size,\n    collate_fn=model.collate,\n    shuffle=True,\n)\n

                                                                                                                                                                      5. We instantiate an optimizer and start the training loop

                                                                                                                                                                        from itertools import chain, repeat\n\noptimizer = torch.optim.AdamW(\n    params=model.parameters(),\n    lr=lr,\n)\n\n# We will loop over the dataloader\niterator = chain.from_iterable(repeat(dataloader))\n\nfor step in tqdm(range(max_steps), \"Training model\", leave=True):\n    batch = next(iterator)\n    optimizer.zero_grad()\n

                                                                                                                                                                      6. The trainable components are fed the collated batches from the dataloader with the TrainablePipe.module_forward methods to compute the losses. Since outputs of shared subcomponents are reused between components, we enable caching by wrapping this step in a cache context. The training loop is otherwise carried out in a similar fashion to a standard pytorch training loop

                                                                                                                                                                        with model.cache():\n    loss = torch.zeros((), device=\"cpu\")\n    for name, component in model.trainable_pipes():\n        output = component.module_forward(batch[component.name])\n        if \"loss\" in output:\n            loss += output[\"loss\"]\n\n    loss.backward()\n\n    optimizer.step()\n

                                                                                                                                                                      7. Finally, the model is evaluated on the validation dataset at regular intervals and saved at the end of the training. To score the model, we only want to run the \"classifier\" component and not the extractor, otherwise we would overwrite annotated text boxes on documents in the val_docs dataset, and have mismatching text boxes between the gold and predicted documents. To save the model, although you can use torch.save to save your model, we provide a safer method to avoid the security pitfalls of pickled models

                                                                                                                                                                        from edspdf import Pipeline\nfrom sklearn.metrics import classification_report\nfrom copy import deepcopy\n\n\ndef score(golds, preds):\n    return classification_report(\n        [b.label for gold in golds for b in gold.text_boxes if b.text != \"\"],\n        [b.label for pred in preds for b in pred.text_boxes if b.text != \"\"],\n        output_dict=True,\n        zero_division=0,\n    )\n\n\n...\n\nif (step % 100) == 0:\n    # we only want to run \"classifier\" component, not overwrite the text boxes\n    with model.select_pipes(enable=[\"classifier\"]):\n        print(score(val_docs, model.pipe(deepcopy(val_docs))))\n\n# torch.save(model, \"model.pt\")\nmodel.save(\"model\")\n

                                                                                                                                                                      "},{"location":"recipes/training/#adapting-a-dataset","title":"Adapting a dataset","text":"

                                                                                                                                                                      The first step of training a pipeline is to adapt the dataset to the pipeline. This is done by converting the dataset into a list of PDFDoc objects, using an extractor. The following function loads a dataset of .pdf and .json files, where each .json file contains box annotations represented with page, x0, x1, y0, y1 and label.

                                                                                                                                                                      from edspdf.utils.alignment import align_box_labels\nfrom pathlib import Path\nfrom pydantic import DirectoryPath\nfrom edspdf.registry import registry\nfrom edspdf.structures import Box\nimport json\n\n\n@registry.adapter.register(\"my-segmentation-adapter\")\ndef segmentation_adapter(\n    path: DirectoryPath,\n):\n    def adapt_to(model):\n        for anns_filepath in sorted(Path(path).glob(\"*.json\")):\n            pdf_filepath = str(anns_filepath).replace(\".json\", \".pdf\")\n            with open(anns_filepath) as f:\n                sample = json.load(f)\n            pdf = Path(pdf_filepath).read_bytes()\n\n            if len(sample[\"annotations\"]) == 0:\n                continue\n\n            doc = model.components.extractor(pdf)\n            doc.id = pdf_filepath.split(\".\")[0].split(\"/\")[-1]\n            doc.lines = [\n                line\n                for page in sorted(set(b.page for b in doc.lines))\n                for line in align_box_labels(\n                    src_boxes=[\n                        Box(\n                            page_num=b[\"page\"],\n                            x0=b[\"x0\"],\n                            x1=b[\"x1\"],\n                            y0=b[\"y0\"],\n                            y1=b[\"y1\"],\n                            label=b[\"label\"],\n                        )\n                        for b in sample[\"annotations\"]\n                        if b[\"page\"] == page\n                    ],\n                    dst_boxes=doc.lines,\n                    pollution_label=None,\n                )\n                if line.text == \"\" or line.label is not None\n            ]\n            yield doc\n\n    return adapt_to\n
                                                                                                                                                                      "},{"location":"recipes/training/#full-example","title":"Full example","text":"

                                                                                                                                                                      Let's wrap the training code in a function, and make it callable from the command line using confit!

                                                                                                                                                                      train.py
                                                                                                                                                                      import itertools\nimport json\nfrom copy import deepcopy\nfrom pathlib import Path\n\nimport torch\nfrom confit import Cli\nfrom pydantic import DirectoryPath\nfrom torch.utils.data import DataLoader\nfrom tqdm import tqdm\n\nfrom edspdf import Pipeline, registry\nfrom edspdf.structures import Box\nfrom edspdf.utils.alignment import align_box_labels\nfrom edspdf.utils.random import set_seed\n\napp = Cli(pretty_exceptions_show_locals=False)\n\n\ndef score(golds, preds):\n    return classification_report(\n        [b.label for gold in golds for b in gold.text_boxes if b.text != \"\"],\n        [b.label for pred in preds for b in pred.text_boxes if b.text != \"\"],\n        output_dict=True,\n        zero_division=0,\n    )\n\n\n@registry.adapter.register(\"my-segmentation-adapter\")\ndef segmentation_adapter(\n    path: str,\n):\n    def adapt_to(model):\n        for anns_filepath in sorted(Path(path).glob(\"*.json\")):\n            pdf_filepath = str(anns_filepath).replace(\".json\", \".pdf\")\n            with open(anns_filepath) as f:\n                sample = json.load(f)\n            pdf = Path(pdf_filepath).read_bytes()\n\n            if len(sample[\"annotations\"]) == 0:\n                continue\n\n            doc = model.get_pipe(\"extractor\")(pdf)\n            doc.id = pdf_filepath.split(\".\")[0].split(\"/\")[-1]\n            doc.content_boxes = [\n                line\n                for page_num in sorted(set(b.page_num for b in doc.lines))\n                for line in align_box_labels(\n                    src_boxes=[\n                        Box(\n                            page_num=b[\"page\"],\n                            x0=b[\"x0\"],\n                            x1=b[\"x1\"],\n                            y0=b[\"y0\"],\n                            y1=b[\"y1\"],\n                      
      label=b[\"label\"],\n                        )\n                        for b in sample[\"annotations\"]\n                        if b[\"page\"] == page_num\n                    ],\n                    dst_boxes=doc.lines,\n                    pollution_label=None,\n                )\n                if line.text == \"\" or line.label is not None\n            ]\n            yield doc\n\n    return adapt_to\n\n\n@app.command(name=\"train\")\ndef train_my_model(\n    train_path: DirectoryPath = \"dataset/train\",\n    val_path: DirectoryPath = \"dataset/dev\",\n    max_steps: int = 1000,\n    batch_size: int = 4,\n    lr: float = 3e-4,\n):\n    set_seed(42)\n\n    # We define the model\n    model = Pipeline()\n    model.add_pipe(\"mupdf-extractor\", name=\"extractor\")\n    model.add_pipe(\n        \"box-transformer\",\n        name=\"embedding\",\n        config={\n            \"num_heads\": 4,\n            \"dropout_p\": 0.1,\n            \"activation\": \"gelu\",\n            \"init_resweight\": 0.01,\n            \"head_size\": 16,\n            \"attention_mode\": [\"c2c\", \"c2p\", \"p2c\"],\n            \"n_layers\": 1,\n            \"n_relative_positions\": 64,\n            \"embedding\": {\n                \"@factory\": \"embedding-combiner\",\n                \"dropout_p\": 0.1,\n                \"text_encoder\": {\n                    \"@factory\": \"sub-box-cnn-pooler\",\n                    \"out_channels\": 64,\n                    \"kernel_sizes\": (3, 4, 5),\n                    \"embedding\": {\n                        \"@factory\": \"simple-text-embedding\",\n                        \"size\": 72,\n                    },\n                },\n                \"layout_encoder\": {\n                    \"@factory\": \"box-layout-embedding\",\n                    \"n_positions\": 64,\n                    \"x_mode\": \"learned\",\n                    \"y_mode\": \"learned\",\n                    \"w_mode\": \"learned\",\n                    
\"h_mode\": \"learned\",\n                    \"size\": 72,\n                },\n            },\n        },\n    )\n    model.add_pipe(\n        \"trainable-classifier\",\n        name=\"classifier\",\n        config={\n            \"embedding\": model.get_pipe(\"embedding\"),\n            \"labels\": [],\n        },\n    )\n\n    # Loading and adapting the training and validation data\n    train_docs = list(segmentation_adapter(train_path)(model))\n    val_docs = list(segmentation_adapter(val_path)(model))\n\n    # Taking the first `initialization_subset` samples to initialize the model\n    model.post_init(train_docs)\n\n    # Preprocessing the training dataset into a dataloader\n    preprocessed = list(model.preprocess_many(train_docs, supervision=True))\n    dataloader = DataLoader(\n        preprocessed,\n        batch_size=batch_size,\n        collate_fn=model.collate,\n        shuffle=True,\n    )\n\n    optimizer = torch.optim.AdamW(\n        params=model.parameters(),\n        lr=lr,\n    )\n\n    # We will loop over the dataloader\n    iterator = itertools.chain.from_iterable(itertools.repeat(dataloader))\n\n    for step in tqdm(range(max_steps), \"Training model\", leave=True):\n        batch = next(iterator)\n        optimizer.zero_grad()\n\n        with model.cache():\n            loss = torch.zeros((), device=\"cpu\")\n            for name, component in model.trainable_pipes():\n                output = component.module_forward(batch[component.name])\n                if \"loss\" in output:\n                    loss += output[\"loss\"]\n\n            loss.backward()\n\n            optimizer.step()\n\n        if (step % 100) == 0:\n            with model.select_pipes(enable=[\"classifier\"]):\n                print(score(val_docs, model.pipe(deepcopy(val_docs))))\n            model.save(\"model\")\n\n    return model\n\n\nif __name__ == \"__main__\":\n    app()\n
                                                                                                                                                                      python train.py --seed 42\n

                                                                                                                                                                      At the end of the training, the pipeline is ready to use (with the .pipe method) since every trained component of the pipeline is self-sufficient, ie contains the preprocessing, inference and postprocessing code required to run it.

                                                                                                                                                                      "},{"location":"recipes/training/#configuration","title":"Configuration","text":"

                                                                                                                                                                      To decouple the configuration and the code of our training script, let's define a configuration file where we will describe both our training parameters and the pipeline. You can either write the config of the pipeline by hand, or generate it from an instantiated pipeline by running:

                                                                                                                                                                      print(pipeline.config.to_str())\n
                                                                                                                                                                      Custom architecturePretrained Huggingface Transformer config.cfg
                                                                                                                                                                      # This is the equivalent of the API-based declaration at the beginning of the tutorial\n[pipeline]\npipeline = [\"extractor\", \"embedding\", \"classifier\"]\ndisabled = []\ncomponents = ${components}\n\n[components]\n\n[components.extractor]\n@factory = \"pdfminer-extractor\"\n\n[components.embedding]\n@factory = \"box-transformer\"\nnum_heads = 4\ndropout_p = 0.1\nactivation = \"gelu\"\ninit_resweight = 0.01\nhead_size = 16\nattention_mode = [\"c2c\", \"c2p\", \"p2c\"]\nn_layers = 1\nn_relative_positions = 64\n\n[components.embedding.embedding]\n@factory = \"embedding-combiner\"\ndropout_p = 0.1\n\n[components.embedding.embedding.text_encoder]\n@factory = \"sub-box-cnn-pooler\"\nout_channels = 64\nkernel_sizes = (3, 4, 5)\n\n[components.embedding.embedding.text_encoder.embedding]\n@factory = \"simple-text-embedding\"\nsize = 72\n\n[components.embedding.embedding.layout_encoder]\n@factory = \"box-layout-embedding\"\nn_positions = 64\nx_mode = \"learned\"\ny_mode = \"learned\"\nw_mode = \"learned\"\nh_mode = \"learned\"\nsize = 72\n\n[components.classifier]\n@factory = \"trainable-classifier\"\nembedding = ${components.embedding}\nlabels = []\n\n# This is where we define the training script parameters\n# the \"train\" section refers to the name of the command in the training script\n[train]\nmodel = ${pipeline}\ntrain_data = {\"@adapter\": \"my-segmentation-adapter\", \"path\": \"data/train\"}\nval_data = {\"@adapter\": \"my-segmentation-adapter\", \"path\": \"data/val\"}\nmax_steps = 1000\nseed = 42\nlr = 3e-4\nbatch_size = 4\n
                                                                                                                                                                      config.cfg
                                                                                                                                                                      [pipeline]\npipeline = [\"extractor\", \"embedding\", \"classifier\"]\ndisabled = []\ncomponents = ${components}\n\n[components]\n\n[components.extractor]\n@factory = \"mupdf-extractor\"\nrender_pages = true\n\n[components.embedding]\n@factory = \"huggingface-embedding\"\nmodel = \"microsoft/layoutlmv3-base\"\nuse_image = false\nwindow = 128\nstride = 64\nline_pooling = \"mean\"\n\n[components.classifier]\n@factory = \"trainable-classifier\"\nembedding = ${components.embedding}\nlabels = []\n\n[train]\nmodel = ${pipeline}\nmax_steps = 1000\nlr = 5e-5\nseed = 42\ntrain_data = {\"@adapter\": \"my-segmentation-adapter\", \"path\": \"data/train\"}\nval_data = {\"@adapter\": \"my-segmentation-adapter\", \"path\": \"data/val\"}\nbatch_size = 8\n

                                                                                                                                                                      and update our training script to use the pipeline and the data adapters defined in the configuration file instead of the Python declaration :

                                                                                                                                                                      @app.command(name=\"train\")\ndef train_my_model(\n+   model: Pipeline,\n-   train_path: DirectoryPath = \"data/train\",\n+   train_data: Callable = segmentation_adapter(\"data/train\"),\n-   val_path: DirectoryPath = \"data/val\",\n+   val_data: Callable = segmentation_adapter(\"data/val\"),\n   seed: int = 42,\n    max_steps: int = 1000,\n    batch_size: int = 4,\n    lr: float = 3e-4,\n):\n    # Seed will be set by the CLI util, before `model` is instantiated\n-   set_seed(seed)\n\n   # Model will be defined from the config file using registries\n-   model = Pipeline()\n-   model.add_pipe(\"mupdf-extractor\", name=\"extractor\")\n-   model.add_pipe(\n-       \"box-transformer\",\n-       name=\"embedding\",\n-       config={\n-           \"num_heads\": 4,\n-           \"dropout_p\": 0.1,\n-           \"activation\": \"gelu\",\n-           \"init_resweight\": 0.01,\n-           \"head_size\": 16,\n-           \"attention_mode\": [\"c2c\", \"c2p\", \"p2c\"],\n-           \"n_layers\": 1,\n-           \"n_relative_positions\": 64,\n-           \"embedding\": {\n-               \"@factory\": \"embedding-combiner\",\n-               \"dropout_p\": 0.1,\n-               \"text_encoder\": {\n-                   \"@factory\": \"sub-box-cnn-pooler\",\n-                   \"out_channels\": 64,\n-                   \"kernel_sizes\": (3, 4, 5),\n-                   \"embedding\": {\n-                       \"@factory\": \"simple-text-embedding\",\n-                       \"size\": 72,\n-                   },\n-               },\n-               \"layout_encoder\": {\n-                   \"@factory\": \"box-layout-embedding\",\n-                   \"n_positions\": 64,\n-                   \"x_mode\": \"learned\",\n-                   \"y_mode\": \"learned\",\n-                   \"w_mode\": \"learned\",\n-       
            \"h_mode\": \"learned\",\n-                   \"size\": 72,\n-               },\n-           },\n-       },\n-   )\n-   model.add_pipe(\n-       \"trainable-classifier\",\n-       name=\"classifier\",\n-       config={\n-           \"embedding\": model.get_pipe(\"embedding\"),\n-           \"labels\": [],\n-       },\n-   )\n\n   # Loading and adapting the training and validation data\n-    train_docs = list(segmentation_adapter(train_path)(model))\n+    train_docs = list(train_data(model))\n-    val_docs = list(segmentation_adapter(val_path)(model))\n+    val_docs = list(val_data(model))\n\n   # Taking the first `initialization_subset` samples to initialize the model\n    ...\n

                                                                                                                                                                      That's it! We can now call the training script with the configuration file as a parameter, and override some of its default values:

                                                                                                                                                                      python train.py --config config.cfg --components.extractor.extract_styles=true --seed 43\n
                                                                                                                                                                      "},{"location":"reference/edspdf/","title":"edspdf","text":""},{"location":"reference/edspdf/pipeline/","title":"edspdf.pipeline","text":""},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline","title":"Pipeline","text":"

                                                                                                                                                                      Pipeline to build hybrid and multitask PDF processing pipeline. It uses PyTorch as the deep-learning backend and allows components to share subcomponents.

                                                                                                                                                                      See the documentation for more details.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION batch_size

                                                                                                                                                                      Batch size to use in the .pipe() method

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: 4

                                                                                                                                                                      meta

                                                                                                                                                                      Meta information about the pipeline

                                                                                                                                                                      TYPE: Dict[str, Any] DEFAULT: None

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.disabled","title":"disabled property","text":"

                                                                                                                                                                      The names of the disabled components

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.cfg","title":"cfg: Config property","text":"

                                                                                                                                                                      Returns the config of the pipeline, including the config of all components. Updated from spacy to allow references between components.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.get_pipe","title":"get_pipe","text":"

                                                                                                                                                                      Get a component by its name.

                                                                                                                                                                      PARAMETER DESCRIPTION name

                                                                                                                                                                      The name of the component to get.

                                                                                                                                                                      TYPE: str

                                                                                                                                                                      RETURNS DESCRIPTION Pipe"},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.has_pipe","title":"has_pipe","text":"

                                                                                                                                                                      Check if a component exists in the pipeline.

                                                                                                                                                                      PARAMETER DESCRIPTION name

                                                                                                                                                                      The name of the component to check.

                                                                                                                                                                      TYPE: str

                                                                                                                                                                      RETURNS DESCRIPTION bool"},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.create_pipe","title":"create_pipe","text":"

                                                                                                                                                                      Create a component from a factory name.

                                                                                                                                                                      PARAMETER DESCRIPTION factory

                                                                                                                                                                      The name of the factory to use

                                                                                                                                                                      TYPE: str

                                                                                                                                                                      name

                                                                                                                                                                      The name of the component

                                                                                                                                                                      TYPE: str

                                                                                                                                                                      config

                                                                                                                                                                      The config to pass to the factory

                                                                                                                                                                      TYPE: Dict[str, Any] DEFAULT: None

                                                                                                                                                                      RETURNS DESCRIPTION Pipe"},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.add_pipe","title":"add_pipe","text":"

                                                                                                                                                                      Add a component to the pipeline.

                                                                                                                                                                      PARAMETER DESCRIPTION factory

                                                                                                                                                                      The name of the component to add or the component itself

                                                                                                                                                                      TYPE: Union[str, Pipe]

                                                                                                                                                                      name

                                                                                                                                                                      The name of the component. If not provided, the name of the component will be used if it has one (.name), otherwise the factory name will be used.

                                                                                                                                                                      TYPE: Optional[str] DEFAULT: None

                                                                                                                                                                      first

                                                                                                                                                                      Whether to add the component to the beginning of the pipeline. This argument is mutually exclusive with before and after.

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      before

                                                                                                                                                                      The name of the component to add the new component before. This argument is mutually exclusive with after and first.

                                                                                                                                                                      TYPE: Optional[str] DEFAULT: None

                                                                                                                                                                      after

                                                                                                                                                                      The name of the component to add the new component after. This argument is mutually exclusive with before and first.

                                                                                                                                                                      TYPE: Optional[str] DEFAULT: None

                                                                                                                                                                      config

                                                                                                                                                                      The arguments to pass to the component factory.

                                                                                                                                                                      Note that instead of replacing arguments with the same keys, the config will be merged with the default config of the component. This means that you can override specific nested arguments without having to specify the entire config.

                                                                                                                                                                      TYPE: Optional[Dict[str, Any]] DEFAULT: None

                                                                                                                                                                      RETURNS DESCRIPTION Pipe

                                                                                                                                                                      The component that was added to the pipeline.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.__call__","title":"__call__","text":"

                                                                                                                                                                      Apply each component successively on a document.

                                                                                                                                                                      PARAMETER DESCRIPTION doc

                                                                                                                                                                      The doc to create the PDFDoc from, or a PDFDoc.

                                                                                                                                                                      TYPE: Any

                                                                                                                                                                      RETURNS DESCRIPTION PDFDoc"},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.pipe","title":"pipe","text":"

                                                                                                                                                                      Process a stream of documents by applying each component successively on batches of documents.

                                                                                                                                                                      PARAMETER DESCRIPTION inputs

                                                                                                                                                                      The inputs to create the PDFDocs from, or the PDFDocs directly.

                                                                                                                                                                      TYPE: Any

                                                                                                                                                                      batch_size

                                                                                                                                                                      The batch size to use. If not provided, the batch size of the pipeline object will be used.

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      accelerator

                                                                                                                                                                      The accelerator to use for processing the documents. If not provided, the default accelerator will be used.

                                                                                                                                                                      TYPE: Optional[Union[str, Accelerator]] DEFAULT: None

                                                                                                                                                                      to_doc

                                                                                                                                                                      The function to use to convert the inputs to PDFDoc objects. By default, the content field of the inputs will be used if dict-like objects are provided, otherwise the inputs will be passed directly to the pipeline.

                                                                                                                                                                      TYPE: Optional[ToDoc] DEFAULT: None

                                                                                                                                                                      from_doc

                                                                                                                                                                      The function to use to convert the PDFDoc objects to outputs. By default, the PDFDoc objects will be returned directly.

                                                                                                                                                                      TYPE: FromDoc DEFAULT: lambda doc: doc

                                                                                                                                                                      RETURNS DESCRIPTION Iterable[PDFDoc]"},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.cache","title":"cache","text":"

                                                                                                                                                                      Enable caching for all (trainable) components in the pipeline

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.trainable_pipes","title":"trainable_pipes","text":"

                                                                                                                                                                      Yields components that are PyTorch modules.

                                                                                                                                                                      PARAMETER DESCRIPTION disable

                                                                                                                                                                      The names of disabled components, which will be skipped.

                                                                                                                                                                      TYPE: Sequence[str] DEFAULT: ()

                                                                                                                                                                      RETURNS DESCRIPTION Iterable[Tuple[str, TrainablePipe]]"},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.post_init","title":"post_init","text":"

                                                                                                                                                                      Completes the initialization of the pipeline by calling the post_init method of all components that have one. This is useful for components that need to see some data to build their vocabulary, for instance.

                                                                                                                                                                      PARAMETER DESCRIPTION gold_data

                                                                                                                                                                      The documents to use for initialization. Each component will not necessarily see all the data.

                                                                                                                                                                      TYPE: Iterable[PDFDoc]

                                                                                                                                                                      exclude

                                                                                                                                                                      The names of components to exclude from initialization. This argument will be gradually updated with the names of initialized components

                                                                                                                                                                      TYPE: Optional[set] DEFAULT: None

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.from_config","title":"from_config classmethod","text":"

                                                                                                                                                                      Create a pipeline from a config object

                                                                                                                                                                      PARAMETER DESCRIPTION config

                                                                                                                                                                      The config to use

                                                                                                                                                                      TYPE: Dict[str, Any] DEFAULT: {}

                                                                                                                                                                      disable

                                                                                                                                                                      Components to disable

                                                                                                                                                                      TYPE: Optional[Set[str]] DEFAULT: None

                                                                                                                                                                      enable

                                                                                                                                                                      Components to enable

                                                                                                                                                                      TYPE: Optional[Set[str]] DEFAULT: None

                                                                                                                                                                      exclude

                                                                                                                                                                      Components to exclude

                                                                                                                                                                      TYPE: Optional[Set[str]] DEFAULT: None

                                                                                                                                                                      meta

                                                                                                                                                                      Metadata to add to the pipeline

                                                                                                                                                                      TYPE: Optional[Dict[str, Any]] DEFAULT: None

                                                                                                                                                                      RETURNS DESCRIPTION Pipeline"},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.__get_validators__","title":"__get_validators__ classmethod","text":"

                                                                                                                                                                      Pydantic validators generator

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.validate","title":"validate classmethod","text":"

                                                                                                                                                                      Pydantic validator, used in the validate_arguments decorated functions

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.preprocess","title":"preprocess","text":"

                                                                                                                                                                      Run the preprocessing methods of each component in the pipeline on a document and returns a dictionary containing the results, with the component names as keys.

                                                                                                                                                                      PARAMETER DESCRIPTION doc

                                                                                                                                                                      The document to preprocess

                                                                                                                                                                      TYPE: PDFDoc

                                                                                                                                                                      supervision

                                                                                                                                                                      Whether to include supervision information in the preprocessing

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      RETURNS DESCRIPTION Dict[str, Any]"},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.preprocess_many","title":"preprocess_many","text":"

                                                                                                                                                                      Runs the preprocessing methods of each component in the pipeline on a collection of documents and returns an iterable of dictionaries containing the results, with the component names as keys.

                                                                                                                                                                      PARAMETER DESCRIPTION docs

                                                                                                                                                                      TYPE: Iterable[PDFDoc]

                                                                                                                                                                      compress

                                                                                                                                                                      Whether to deduplicate identical preprocessing outputs in the results when multiple documents share identical subcomponents. This step is required to enable the cache mechanism when training or running the pipeline over tabular datasets, such as pyarrow tables, that do not store referential equality information.

                                                                                                                                                                      DEFAULT: True

                                                                                                                                                                      supervision

                                                                                                                                                                      Whether to include supervision information in the preprocessing

                                                                                                                                                                      DEFAULT: True

                                                                                                                                                                      RETURNS DESCRIPTION Iterable[OutputT]"},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.collate","title":"collate","text":"

                                                                                                                                                                      Collates a batch of preprocessed samples into a single (maybe nested) dictionary of tensors by calling the collate method of each component.

                                                                                                                                                                      PARAMETER DESCRIPTION batch

                                                                                                                                                                      The batch of preprocessed samples

                                                                                                                                                                      TYPE: List[Dict[str, Any]]

                                                                                                                                                                      device

                                                                                                                                                                      The device to move the tensors to before returning them

                                                                                                                                                                      TYPE: Optional[device] DEFAULT: None

                                                                                                                                                                      RETURNS DESCRIPTION Dict[str, Any]

                                                                                                                                                                      The collated batch

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.parameters","title":"parameters","text":"

                                                                                                                                                                      Returns an iterator over the Pytorch parameters of the components in the pipeline

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.named_parameters","title":"named_parameters","text":"

                                                                                                                                                                      Returns an iterator over the named Pytorch parameters of the components in the pipeline

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.to","title":"to","text":"

                                                                                                                                                                      Moves the pipeline to a given device

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.train","title":"train","text":"

                                                                                                                                                                      Enables training mode on pytorch modules

                                                                                                                                                                      PARAMETER DESCRIPTION mode

                                                                                                                                                                      Whether to enable training or not

                                                                                                                                                                      DEFAULT: True

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.save","title":"save","text":"

                                                                                                                                                                      Save the pipeline to a directory.

                                                                                                                                                                      PARAMETER DESCRIPTION path

                                                                                                                                                                      The path to the directory to save the pipeline to. Every component will be saved to separated subdirectories of this directory, except for tensors that will be saved to a shared files depending on the references between the components.

                                                                                                                                                                      TYPE: Union[str, Path]

                                                                                                                                                                      exclude

                                                                                                                                                                      The names of the components, or attributes to exclude from the saving process. This list will be gradually filled in place as components are saved

                                                                                                                                                                      TYPE: Optional[Set[str]] DEFAULT: None

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.load_state_from_disk","title":"load_state_from_disk","text":"

                                                                                                                                                                      Load the pipeline from a directory. Components will be updated in-place.

                                                                                                                                                                      PARAMETER DESCRIPTION path

                                                                                                                                                                      The path to the directory to load the pipeline from

                                                                                                                                                                      TYPE: Union[str, Path]

                                                                                                                                                                      exclude

                                                                                                                                                                      The names of the components, or attributes to exclude from the loading process. This list will be gradually filled in place as components are loaded

                                                                                                                                                                      TYPE: Set[str] DEFAULT: None

                                                                                                                                                                      "},{"location":"reference/edspdf/pipeline/#edspdf.pipeline.Pipeline.select_pipes","title":"select_pipes","text":"

                                                                                                                                                                      Temporarily disable and enable components in the pipeline.

                                                                                                                                                                      PARAMETER DESCRIPTION disable

                                                                                                                                                                      The name of the component to disable, or a list of names.

                                                                                                                                                                      TYPE: Optional[Union[str, Iterable[str]]] DEFAULT: None

                                                                                                                                                                      enable

                                                                                                                                                                      The name of the component to enable, or a list of names.

                                                                                                                                                                      TYPE: Optional[Union[str, Iterable[str]]] DEFAULT: None

                                                                                                                                                                      "},{"location":"reference/edspdf/registry/","title":"edspdf.registry","text":""},{"location":"reference/edspdf/registry/#edspdf.registry.CurriedFactory","title":"CurriedFactory","text":""},{"location":"reference/edspdf/registry/#edspdf.registry.CurriedFactory.instantiate","title":"instantiate","text":"

                                                                                                                                                                      We need to support passing in the pipeline object and name to factories from a config file. Since components can be nested, we need to add them to every factory in the config.

                                                                                                                                                                      "},{"location":"reference/edspdf/registry/#edspdf.registry.FactoryRegistry","title":"FactoryRegistry","text":"

                                                                                                                                                                      Bases: Registry

                                                                                                                                                                      A registry that validates the input arguments of the registered functions.

                                                                                                                                                                      "},{"location":"reference/edspdf/registry/#edspdf.registry.FactoryRegistry.get","title":"get","text":"

                                                                                                                                                                      Get the registered function for a given name.

                                                                                                                                                                      name (str): The name. RETURNS (Any): The registered function.

                                                                                                                                                                      "},{"location":"reference/edspdf/registry/#edspdf.registry.FactoryRegistry.register","title":"register","text":"

                                                                                                                                                                      This is a convenience wrapper around confit.Registry.register, that curries the function to be registered, allowing to instantiate the class later once pipeline and name are known.

                                                                                                                                                                      PARAMETER DESCRIPTION name

                                                                                                                                                                      TYPE: str

                                                                                                                                                                      func

                                                                                                                                                                      TYPE: Optional[InFunc] DEFAULT: None

                                                                                                                                                                      default_config

                                                                                                                                                                      TYPE: Dict[str, Any] DEFAULT: FrozenDict()

                                                                                                                                                                      assigns

                                                                                                                                                                      TYPE: Iterable[str] DEFAULT: FrozenList()

                                                                                                                                                                      requires

                                                                                                                                                                      TYPE: Iterable[str] DEFAULT: FrozenList()

                                                                                                                                                                      retokenizes

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      default_score_weights

                                                                                                                                                                      TYPE: Dict[str, Optional[float]] DEFAULT: FrozenDict()

                                                                                                                                                                      RETURNS DESCRIPTION Callable[[InFunc], InFunc]"},{"location":"reference/edspdf/registry/#edspdf.registry.accepted_arguments","title":"accepted_arguments","text":"

                                                                                                                                                                      Checks that a function accepts a list of keyword arguments

                                                                                                                                                                      PARAMETER DESCRIPTION func

                                                                                                                                                                      Function to check

                                                                                                                                                                      TYPE: Callable

                                                                                                                                                                      args

                                                                                                                                                                      Argument or list of arguments to check

                                                                                                                                                                      TYPE: Sequence[str]

                                                                                                                                                                      RETURNS DESCRIPTION List[str]"},{"location":"reference/edspdf/structures/","title":"edspdf.structures","text":""},{"location":"reference/edspdf/structures/#edspdf.structures.PDFDoc","title":"PDFDoc","text":"

                                                                                                                                                                      Bases: BaseModel

                                                                                                                                                                      This is the main data structure of the library to hold PDFs. It contains the content of the PDF, as well as box annotations and text outputs.

                                                                                                                                                                      ATTRIBUTE DESCRIPTION content

                                                                                                                                                                      The content of the PDF document.

                                                                                                                                                                      TYPE: bytes

                                                                                                                                                                      id

                                                                                                                                                                      The ID of the PDF document.

                                                                                                                                                                      TYPE: (str, optional)

                                                                                                                                                                      pages

                                                                                                                                                                      The pages of the PDF document.

                                                                                                                                                                      TYPE: List[Page]

                                                                                                                                                                      error

                                                                                                                                                                      Whether there was an error when processing this PDF document.

                                                                                                                                                                      TYPE: (bool, optional)

                                                                                                                                                                      content_boxes

                                                                                                                                                                      The content boxes/annotations of the PDF document.

                                                                                                                                                                      TYPE: List[Union[TextBox, ImageBox]]

                                                                                                                                                                      aggregated_texts

                                                                                                                                                                      The aggregated text outputs of the PDF document.

                                                                                                                                                                      TYPE: Dict[str, Text]

                                                                                                                                                                      text_boxes

                                                                                                                                                                      The text boxes of the PDF document.

                                                                                                                                                                      TYPE: List[TextBox]

                                                                                                                                                                      "},{"location":"reference/edspdf/structures/#edspdf.structures.Page","title":"Page","text":"

                                                                                                                                                                      Bases: BaseModel

                                                                                                                                                                      The Page class represents a page of a PDF document.

                                                                                                                                                                      ATTRIBUTE DESCRIPTION page_num

                                                                                                                                                                      The page number of the page.

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      width

                                                                                                                                                                      The width of the page.

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      height

                                                                                                                                                                      The height of the page.

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      doc

                                                                                                                                                                      The PDF document that this page belongs to.

                                                                                                                                                                      TYPE: PDFDoc

                                                                                                                                                                      image

                                                                                                                                                                      The rendered image of the page, stored as a NumPy array.

                                                                                                                                                                      TYPE: Optional[ndarray]

                                                                                                                                                                      text_boxes

                                                                                                                                                                      The text boxes of the page.

                                                                                                                                                                      TYPE: List[TextBox]

                                                                                                                                                                      "},{"location":"reference/edspdf/structures/#edspdf.structures.TextProperties","title":"TextProperties","text":"

                                                                                                                                                                      Bases: BaseModel

                                                                                                                                                                      The TextProperties class represents the style properties of a span of text in a TextBox.

                                                                                                                                                                      ATTRIBUTE DESCRIPTION italic

                                                                                                                                                                      Whether the text is italic.

                                                                                                                                                                      TYPE: bool

                                                                                                                                                                      bold

                                                                                                                                                                      Whether the text is bold.

                                                                                                                                                                      TYPE: bool

                                                                                                                                                                      begin

                                                                                                                                                                      The beginning index of the span of text.

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      end

                                                                                                                                                                      The ending index of the span of text.

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      fontname

                                                                                                                                                                      The font name of the span of text.

                                                                                                                                                                      TYPE: Optional[str]

                                                                                                                                                                      "},{"location":"reference/edspdf/structures/#edspdf.structures.Box","title":"Box","text":"

                                                                                                                                                                      Bases: BaseModel

                                                                                                                                                                      The Box class represents a box annotation in a PDF document. It is the base class of TextBox.

                                                                                                                                                                      ATTRIBUTE DESCRIPTION doc

                                                                                                                                                                      The PDF document that this box belongs to.

                                                                                                                                                                      TYPE: PDFDoc

                                                                                                                                                                      page_num

                                                                                                                                                                      The page number of the box.

                                                                                                                                                                      TYPE: Optional[int]

                                                                                                                                                                      x0

                                                                                                                                                                      The left x-coordinate of the box.

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      x1

                                                                                                                                                                      The right x-coordinate of the box.

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      y0

                                                                                                                                                                      The top y-coordinate of the box.

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      y1

                                                                                                                                                                      The bottom y-coordinate of the box.

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      label

                                                                                                                                                                      The label of the box.

                                                                                                                                                                      TYPE: Optional[str]

                                                                                                                                                                      page

                                                                                                                                                                      The page object that this box belongs to.

                                                                                                                                                                      TYPE: Page

                                                                                                                                                                      "},{"location":"reference/edspdf/structures/#edspdf.structures.Text","title":"Text","text":"

                                                                                                                                                                      Bases: BaseModel

                                                                                                                                                                      The Text class represents a text object, not bound to any box.

                                                                                                                                                                      It can be used to store aggregated text from multiple boxes for example.

                                                                                                                                                                      ATTRIBUTE DESCRIPTION text

                                                                                                                                                                      The text content.

                                                                                                                                                                      TYPE: str

                                                                                                                                                                      properties

                                                                                                                                                                      The style properties of the text.

                                                                                                                                                                      TYPE: List[TextProperties]

                                                                                                                                                                      "},{"location":"reference/edspdf/structures/#edspdf.structures.TextBox","title":"TextBox","text":"

                                                                                                                                                                      Bases: Box

                                                                                                                                                                      The TextBox class represents a text box annotation in a PDF document.

                                                                                                                                                                      ATTRIBUTE DESCRIPTION text

                                                                                                                                                                      The text content of the text box.

                                                                                                                                                                      TYPE: str

                                                                                                                                                                      props

                                                                                                                                                                      The style properties of the text box.

                                                                                                                                                                      TYPE: List[TextProperties]

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/","title":"edspdf.trainable_pipe","text":""},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe","title":"TrainablePipe","text":"

                                                                                                                                                                      Bases: Module, Generic[OutputBatch]

                                                                                                                                                                      A TrainablePipe is a Component that can be trained and inherits torch.nn.Module. You can use it either as a torch module inside a more complex neural network, or as a standalone component in a Pipeline.

                                                                                                                                                                      In addition to the methods of a torch module, a TrainablePipe adds a few methods to handle preprocessing and collating features, as well as caching intermediate results for components that share a common subcomponent.

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.save_extra_data","title":"save_extra_data","text":"

                                                                                                                                                                      Dumps vocabulary indices to JSON files.

                                                                                                                                                                      PARAMETER DESCRIPTION path

                                                                                                                                                                      Path to the directory where the files will be saved

                                                                                                                                                                      TYPE: Path

                                                                                                                                                                      exclude

                                                                                                                                                                      The set of component names to exclude from saving. This is useful when components are repeated in the pipeline.

                                                                                                                                                                      TYPE: set

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.load_extra_data","title":"load_extra_data","text":"

                                                                                                                                                                      Loads vocabulary indices from JSON files.

                                                                                                                                                                      PARAMETER DESCRIPTION path

                                                                                                                                                                      Path to the directory from which the files will be loaded

                                                                                                                                                                      TYPE: Path

                                                                                                                                                                      exclude

                                                                                                                                                                      The set of component names to exclude from loading. This is useful when components are repeated in the pipeline.

                                                                                                                                                                      TYPE: set

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.post_init","title":"post_init","text":"

                                                                                                                                                                      This method completes the attributes of the component, by looking at some documents. It is especially useful to build vocabularies or detect the labels of a classification task.

                                                                                                                                                                      PARAMETER DESCRIPTION gold_data

                                                                                                                                                                      The documents to use for initialization.

                                                                                                                                                                      TYPE: Iterable[PDFDoc]

                                                                                                                                                                      exclude

                                                                                                                                                                      The names of components to exclude from initialization. This argument will be gradually updated with the names of initialized components

                                                                                                                                                                      TYPE: set

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.preprocess","title":"preprocess","text":"

                                                                                                                                                                      Preprocess the document to extract features that will be used by the neural network to perform its predictions.

                                                                                                                                                                      PARAMETER DESCRIPTION doc

                                                                                                                                                                      PDFDocument to preprocess

                                                                                                                                                                      TYPE: PDFDoc

                                                                                                                                                                      RETURNS DESCRIPTION Dict[str, Any]

                                                                                                                                                                      Dictionary (optionally nested) containing the features extracted from the document.

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.collate","title":"collate","text":"

                                                                                                                                                                      Collate the batch of features into a single batch of tensors that can be used by the forward method of the component.

                                                                                                                                                                      PARAMETER DESCRIPTION batch

                                                                                                                                                                      Batch of features

                                                                                                                                                                      TYPE: NestedSequences

                                                                                                                                                                      device

                                                                                                                                                                      Device on which the tensors should be moved

                                                                                                                                                                      TYPE: device

                                                                                                                                                                      RETURNS DESCRIPTION InputBatch

                                                                                                                                                                      Dictionary (optionally nested) containing the collated tensors

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.forward","title":"forward","text":"

                                                                                                                                                                      Perform the forward pass of the neural network, i.e, apply transformations over the collated features to compute new embeddings, probabilities, losses, etc

                                                                                                                                                                      PARAMETER DESCRIPTION batch

                                                                                                                                                                      Batch of tensors (nested dictionary) computed by the collate method

                                                                                                                                                                      TYPE: InputBatch

                                                                                                                                                                      RETURNS DESCRIPTION OutputBatch"},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.module_forward","title":"module_forward","text":"

                                                                                                                                                                      This is a wrapper around torch.nn.Module.__call__ to avoid conflict with the TrainablePipe.__call__ method.

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.make_batch","title":"make_batch","text":"

                                                                                                                                                                      Convenience method to preprocess a batch of documents and collate them Features corresponding to the same path are grouped together in a list, under the same key.

                                                                                                                                                                      PARAMETER DESCRIPTION docs

                                                                                                                                                                      Batch of documents

                                                                                                                                                                      TYPE: Sequence[PDFDoc]

                                                                                                                                                                      supervision

                                                                                                                                                                      Whether to extract supervision features or not

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      RETURNS DESCRIPTION Dict[str, Sequence[Any]]"},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.batch_process","title":"batch_process","text":"

                                                                                                                                                                      Process a batch of documents using the neural network. This differs from the pipe method in that it does not return an iterator, but executes the component on the whole batch at once.

                                                                                                                                                                      PARAMETER DESCRIPTION docs

                                                                                                                                                                      Batch of documents

                                                                                                                                                                      TYPE: Sequence[PDFDoc]

                                                                                                                                                                      RETURNS DESCRIPTION Sequence[PDFDoc]

                                                                                                                                                                      Batch of updated documents

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.postprocess","title":"postprocess","text":"

                                                                                                                                                                      Update the documents with the predictions of the neural network, for instance converting label probabilities into label attributes on the document lines.

                                                                                                                                                                      By default, this is a no-op.

                                                                                                                                                                      PARAMETER DESCRIPTION docs

                                                                                                                                                                      Batch of documents

                                                                                                                                                                      TYPE: Sequence[PDFDoc]

                                                                                                                                                                      batch

                                                                                                                                                                      Batch of predictions, as returned by the forward method

                                                                                                                                                                      TYPE: OutputBatch

                                                                                                                                                                      RETURNS DESCRIPTION Sequence[PDFDoc]"},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.preprocess_supervised","title":"preprocess_supervised","text":"

                                                                                                                                                                      Preprocess the document to extract features that will be used by the neural network to perform its training. By default, this returns the same features as the preprocess method.

                                                                                                                                                                      PARAMETER DESCRIPTION doc

                                                                                                                                                                      PDF document to preprocess

                                                                                                                                                                      TYPE: PDFDoc

                                                                                                                                                                      RETURNS DESCRIPTION Dict[str, Any]

                                                                                                                                                                      Dictionary (optionally nested) containing the features extracted from the document.

                                                                                                                                                                      "},{"location":"reference/edspdf/trainable_pipe/#edspdf.trainable_pipe.TrainablePipe.__call__","title":"__call__","text":"

                                                                                                                                                                      Applies the component on a single doc. For multiple documents, prefer batch processing via the batch_process method. In general, prefer the Pipeline methods

                                                                                                                                                                      PARAMETER DESCRIPTION doc

                                                                                                                                                                      TYPE: PDFDoc

                                                                                                                                                                      RETURNS DESCRIPTION PDFDoc"},{"location":"reference/edspdf/accelerators/","title":"edspdf.accelerators","text":""},{"location":"reference/edspdf/accelerators/base/","title":"edspdf.accelerators.base","text":""},{"location":"reference/edspdf/accelerators/base/#edspdf.accelerators.base.FromDoc","title":"FromDoc","text":"

                                                                                                                                                                      A FromDoc converter (from a PDFDoc to an arbitrary type) can be either:

                                                                                                                                                                      • a dict mapping field names to doc attributes
                                                                                                                                                                      • a callable that takes a PDFDoc and returns an arbitrary type
                                                                                                                                                                      "},{"location":"reference/edspdf/accelerators/multiprocessing/","title":"edspdf.accelerators.multiprocessing","text":""},{"location":"reference/edspdf/accelerators/multiprocessing/#edspdf.accelerators.multiprocessing.MultiprocessingAccelerator","title":"MultiprocessingAccelerator","text":"

                                                                                                                                                                      Bases: Accelerator

                                                                                                                                                                      If you have multiple CPU cores, and optionally multiple GPUs, we provide a multiprocessing accelerator that allows to run the inference on multiple processes.

                                                                                                                                                                      This accelerator dispatches the batches between multiple workers (data-parallelism), and distribute the computation of a given batch on one or two workers (model-parallelism). This is done by creating two types of workers:

                                                                                                                                                                      • a CPUWorker which handles the non deep-learning components and the preprocessing, collating and postprocessing of deep-learning components
                                                                                                                                                                      • a GPUWorker which handles the forward call of the deep-learning components

                                                                                                                                                                      The advantage of dedicating a worker to the deep-learning components is that it allows to prepare multiple batches in parallel in multiple CPUWorker, and ensure that the GPUWorker never wait for a batch to be ready.

                                                                                                                                                                      The overall architecture is described in the following figure, for 3 CPU workers and 2 GPU workers.

                                                                                                                                                                      Here is how a small pipeline with rule-based components and deep-learning components is distributed between the workers:

                                                                                                                                                                      "},{"location":"reference/edspdf/accelerators/multiprocessing/#edspdf.accelerators.multiprocessing.MultiprocessingAccelerator--examples","title":"Examples","text":"
                                                                                                                                                                      docs = list(\n    pipeline.pipe(\n        [content1, content2, ...],\n        accelerator={\n            \"@accelerator\": \"multiprocessing\",\n            \"num_cpu_workers\": 3,\n            \"num_gpu_workers\": 2,\n            \"batch_size\": 8,\n        },\n    )\n)\n
                                                                                                                                                                      "},{"location":"reference/edspdf/accelerators/multiprocessing/#edspdf.accelerators.multiprocessing.MultiprocessingAccelerator--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION batch_size

                                                                                                                                                                      Number of documents to process at a time in a CPU/GPU worker

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      num_cpu_workers

                                                                                                                                                                      Number of CPU workers. A CPU worker handles the non deep-learning components and the preprocessing, collating and postprocessing of deep-learning components.

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      num_gpu_workers

                                                                                                                                                                      Number of GPU workers. A GPU worker handles the forward call of the deep-learning components.

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      gpu_pipe_names

                                                                                                                                                                      List of pipe names to accelerate on a GPUWorker, defaults to all pipes that inherit from TrainablePipe

                                                                                                                                                                      TYPE: Optional[List[str]] DEFAULT: None

                                                                                                                                                                      "},{"location":"reference/edspdf/accelerators/multiprocessing/#edspdf.accelerators.multiprocessing.MultiprocessingAccelerator.__call__","title":"__call__","text":"

                                                                                                                                                                      Stream of documents to process. Each document can be a string or a tuple

                                                                                                                                                                      PARAMETER DESCRIPTION inputs

                                                                                                                                                                      TYPE: Iterable[Any]

                                                                                                                                                                      model

                                                                                                                                                                      TYPE: Any

                                                                                                                                                                      YIELDS DESCRIPTION Any

                                                                                                                                                                      Processed outputs of the pipeline

                                                                                                                                                                      "},{"location":"reference/edspdf/accelerators/simple/","title":"edspdf.accelerators.simple","text":""},{"location":"reference/edspdf/accelerators/simple/#edspdf.accelerators.simple.SimpleAccelerator","title":"SimpleAccelerator","text":"

                                                                                                                                                                      Bases: Accelerator

                                                                                                                                                                      This is the simplest accelerator, which batches the documents and processes each batch on the main process (the one calling .pipe()).

                                                                                                                                                                      "},{"location":"reference/edspdf/accelerators/simple/#edspdf.accelerators.simple.SimpleAccelerator--examples","title":"Examples","text":"
                                                                                                                                                                      docs = list(pipeline.pipe([content1, content2, ...]))\n

                                                                                                                                                                      or, if you want to override the model defined batch size

                                                                                                                                                                      docs = list(pipeline.pipe([content1, content2, ...], batch_size=8))\n

                                                                                                                                                                      which is equivalent to passing a confit dict

                                                                                                                                                                      docs = list(\n    pipeline.pipe(\n        [content1, content2, ...],\n        accelerator={\n            \"@accelerator\": \"simple\",\n            \"batch_size\": 8,\n        },\n    )\n)\n

                                                                                                                                                                      or the instantiated accelerator directly

                                                                                                                                                                      from edspdf.accelerators.simple import SimpleAccelerator\n\naccelerator = SimpleAccelerator(batch_size=8)\ndocs = list(pipeline.pipe([content1, content2, ...], accelerator=accelerator))\n

                                                                                                                                                                      If you have a GPU, make sure to move the model to the appropriate device before calling .pipe(). If you have multiple GPUs, use the multiprocessing accelerator instead.

                                                                                                                                                                      pipeline.to(\"cuda\")\ndocs = list(pipeline.pipe([content1, content2, ...]))\n
                                                                                                                                                                      "},{"location":"reference/edspdf/accelerators/simple/#edspdf.accelerators.simple.SimpleAccelerator--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION batch_size

                                                                                                                                                                      The number of documents to process in each batch.

                                                                                                                                                                      TYPE: int DEFAULT: 32

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/","title":"edspdf.layers","text":""},{"location":"reference/edspdf/layers/box_transformer/","title":"edspdf.layers.box_transformer","text":""},{"location":"reference/edspdf/layers/box_transformer/#edspdf.layers.box_transformer.BoxTransformerLayer","title":"BoxTransformerLayer","text":"

                                                                                                                                                                      Bases: Module

                                                                                                                                                                      BoxTransformerLayer combining a self attention layer and a linear->activation->linear transformation. This layer is used in the BoxTransformerModule module.

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/box_transformer/#edspdf.layers.box_transformer.BoxTransformerLayer--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION input_size

                                                                                                                                                                      Input embedding size

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      num_heads

                                                                                                                                                                      Number of attention heads in the attention layer

                                                                                                                                                                      TYPE: int DEFAULT: 2

                                                                                                                                                                      dropout_p

                                                                                                                                                                      Dropout probability both for the attention layer and embedding projections

                                                                                                                                                                      TYPE: float DEFAULT: 0.0

                                                                                                                                                                      head_size

                                                                                                                                                                      Head sizes of the attention layer

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      activation

                                                                                                                                                                      Activation function used in the linear->activation->linear transformation

                                                                                                                                                                      TYPE: ActivationFunction DEFAULT: 'gelu'

                                                                                                                                                                      init_resweight

                                                                                                                                                                      Initial weight of the residual gates. At 0, the layer acts (initially) as an identity function, and at 1 as a standard Transformer layer. Initializing with a value close to 0 can help the training converge.

                                                                                                                                                                      TYPE: float DEFAULT: 0.0

                                                                                                                                                                      attention_mode

                                                                                                                                                                      Mode of relative position infused attention layer. See the relative attention documentation for more information.

                                                                                                                                                                      TYPE: Sequence[Literal['c2c', 'c2p', 'p2c']] DEFAULT: ('c2c', 'c2p', 'p2c')

                                                                                                                                                                      position_embedding

                                                                                                                                                                      Position embedding to use as key/query position embedding in the attention computation.

                                                                                                                                                                      TYPE: Optional[Union[FloatTensor, Parameter]] DEFAULT: None

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/box_transformer/#edspdf.layers.box_transformer.BoxTransformerLayer.forward","title":"forward","text":"

                                                                                                                                                                      Forward pass of the BoxTransformerLayer

                                                                                                                                                                      PARAMETER DESCRIPTION embeds

                                                                                                                                                                      Embeddings to contextualize Shape: n_samples * n_keys * input_size

                                                                                                                                                                      TYPE: FloatTensor

                                                                                                                                                                      mask

                                                                                                                                                                      Mask of the embeddings. 0 means padding element. Shape: n_samples * n_keys

                                                                                                                                                                      TYPE: BoolTensor

                                                                                                                                                                      relative_positions

                                                                                                                                                                      Position of the keys relatively to the query elements Shape: n_samples * n_queries * n_keys * n_coordinates (2 for x/y)

                                                                                                                                                                      TYPE: LongTensor

                                                                                                                                                                      no_position_mask

                                                                                                                                                                      Key / query pairs for which the position attention terms should be disabled. Shape: n_samples * n_queries * n_keys

                                                                                                                                                                      TYPE: Optional[BoolTensor] DEFAULT: None

                                                                                                                                                                      RETURNS DESCRIPTION Tuple[FloatTensor, FloatTensor]
                                                                                                                                                                      • Contextualized embeddings Shape: n_samples * n_queries * n_keys
                                                                                                                                                                      • Attention logits Shape: n_samples * n_queries * n_keys * n_heads
                                                                                                                                                                      "},{"location":"reference/edspdf/layers/box_transformer/#edspdf.layers.box_transformer.BoxTransformerModule","title":"BoxTransformerModule","text":"

                                                                                                                                                                      Bases: Module

                                                                                                                                                                      Box Transformer architecture combining multiple BoxTransformerLayer modules. It is mainly used in BoxTransformer.

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/box_transformer/#edspdf.layers.box_transformer.BoxTransformerModule--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION input_size

                                                                                                                                                                      Input embedding size

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      num_heads

                                                                                                                                                                      Number of attention heads in the attention layers

                                                                                                                                                                      TYPE: int DEFAULT: 2

                                                                                                                                                                      n_relative_positions

                                                                                                                                                                      Maximum range of embeddable relative positions between boxes (further distances are capped to \u00b1n_relative_positions // 2)

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      dropout_p

                                                                                                                                                                      Dropout probability both for the attention layers and embedding projections

                                                                                                                                                                      TYPE: float DEFAULT: 0.0

                                                                                                                                                                      head_size

                                                                                                                                                                      Head sizes of the attention layers

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      activation

                                                                                                                                                                      Activation function used in the linear->activation->linear transformations

                                                                                                                                                                      TYPE: ActivationFunction DEFAULT: 'gelu'

                                                                                                                                                                      init_resweight

                                                                                                                                                                      Initial weight of the residual gates. At 0, the layer acts (initially) as an identity function, and at 1 as a standard Transformer layer. Initializing with a value close to 0 can help the training converge.

                                                                                                                                                                      TYPE: float DEFAULT: 0.0

                                                                                                                                                                      attention_mode

                                                                                                                                                                      Mode of relative position infused attention layer. See the relative attention documentation for more information.

                                                                                                                                                                      TYPE: Sequence[Literal['c2c', 'c2p', 'p2c']] DEFAULT: ('c2c', 'c2p', 'p2c')

                                                                                                                                                                      n_layers

                                                                                                                                                                      Number of layers in the Transformer

                                                                                                                                                                      TYPE: int DEFAULT: 2

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/box_transformer/#edspdf.layers.box_transformer.BoxTransformerModule.forward","title":"forward","text":"

                                                                                                                                                                      Forward pass of the BoxTransformer

                                                                                                                                                                      PARAMETER DESCRIPTION embeds

                                                                                                                                                                      Embeddings to contextualize Shape: n_samples * n_keys * input_size

                                                                                                                                                                      TYPE: FoldedTensor

                                                                                                                                                                      boxes

                                                                                                                                                                      Layout features of the input elements

                                                                                                                                                                      TYPE: Dict

                                                                                                                                                                      RETURNS DESCRIPTION Tuple[FloatTensor, List[FloatTensor]]
                                                                                                                                                                      • Output of the last BoxTransformerLayer Shape: n_samples * n_keys * input_size
                                                                                                                                                                      • Attention logits of all layers Shape: n_samples * n_queries * n_keys * n_heads
                                                                                                                                                                      "},{"location":"reference/edspdf/layers/relative_attention/","title":"edspdf.layers.relative_attention","text":""},{"location":"reference/edspdf/layers/relative_attention/#edspdf.layers.relative_attention.RelativeAttention","title":"RelativeAttention","text":"

                                                                                                                                                                      Bases: Module

                                                                                                                                                                      A self/cross-attention layer that takes the relative position of elements into account to compute the attention weights. When running a relative attention layer, keys and queries are represented using content and position embeddings, where position embeddings are retrieved using the relative position of keys relative to queries.

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/relative_attention/#edspdf.layers.relative_attention.RelativeAttention--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION size

                                                                                                                                                                      The size of the output embeddings Also serves as default if query_size, pos_size, or key_size is None

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      n_heads

                                                                                                                                                                      The number of attention heads

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      query_size

                                                                                                                                                                      The size of the query embeddings.

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      key_size

                                                                                                                                                                      The size of the key embeddings.

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      value_size

                                                                                                                                                                      The size of the value embeddings

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      head_size

                                                                                                                                                                      The size of each query / key / value chunk used in the attention dot product Default: key_size / n_heads

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      position_embedding

                                                                                                                                                                      The position embedding used as key and query embeddings

                                                                                                                                                                      TYPE: Optional[Union[FloatTensor, Parameter]] DEFAULT: None

                                                                                                                                                                      dropout_p

                                                                                                                                                                      Dropout probability applied on the attention weights Default: 0.0

                                                                                                                                                                      TYPE: float DEFAULT: 0.0

                                                                                                                                                                      same_key_query_proj

                                                                                                                                                                      Whether to use the same projection operator for content key and queries when computing the pre-attention key and query embedding chunks Default: False

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      same_positional_key_query_proj

                                                                                                                                                                      Whether to use the same projection operator for content key and queries when computing the pre-attention key and query embedding chunks Default: False

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      n_coordinates

                                                                                                                                                                      The number of positional coordinates For instance, text is 1D so 1 coordinate, images are 2D so 2 coordinates ... Default: 1

                                                                                                                                                                      TYPE: int DEFAULT: 1

                                                                                                                                                                      head_bias

                                                                                                                                                                      Whether to learn a bias term to add to the attention logits This is only useful if you plan to use the attention logits for subsequent operations, since attention weights are unaffected by bias terms.

                                                                                                                                                                      TYPE: bool DEFAULT: True

                                                                                                                                                                      do_pooling

                                                                                                                                                                      Whether to compute the output embedding. If you only plan to use attention logits, you should disable this parameter. Default: True

                                                                                                                                                                      TYPE: bool DEFAULT: True

                                                                                                                                                                      mode

                                                                                                                                                                      Whether to compute content to content (c2c), content to position (c2p) or position to content (p2c) attention terms. Setting mode=('c2c',) disables relative position attention terms: this is the standard attention layer. To get a better intuition about these different types of attention, here is a formulation as fictitious search samples from a word in a (1D) text:

                                                                                                                                                                      • content-content : \"my content is \u2019ultrasound\u2019 so I\u2019m looking for other words whose content contains information about temporality\"
                                                                                                                                                                      • content-position: \"my content is \u2019ultrasound\u2019 so I\u2019m looking for other words that are 3 positions after me\"
                                                                                                                                                                      • position-content : \"regardless of my content, I will attend to the word one position after from me if it contains information about temporality, two words after me if it contains information about location, etc.\"

                                                                                                                                                                      TYPE: Sequence[Literal['c2c', 'c2p', 'p2c']] DEFAULT: ('c2c', 'p2c', 'c2p')

                                                                                                                                                                      n_additional_heads

                                                                                                                                                                      The number of additional head logits to compute. Those are not used to compute output embeddings, but may be useful in subsequent operation. Default: 0

                                                                                                                                                                      TYPE: int DEFAULT: 0

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/relative_attention/#edspdf.layers.relative_attention.RelativeAttention.forward","title":"forward","text":"

                                                                                                                                                                      Forward pass of the RelativeAttention layer.

                                                                                                                                                                      PARAMETER DESCRIPTION content_queries

                                                                                                                                                                      The content query embedding to use in the attention computation Shape: n_samples * n_queries * query_size

                                                                                                                                                                      TYPE: FloatTensor

                                                                                                                                                                      content_keys

                                                                                                                                                                      The content key embedding to use in the attention computation. If None, defaults to the content_queries Shape: n_samples * n_keys * query_size

                                                                                                                                                                      TYPE: Optional[FloatTensor] DEFAULT: None

                                                                                                                                                                      content_values

                                                                                                                                                                      The content values embedding to use in the final pooling computation. If None, pooling won't be performed. Shape: n_samples * n_keys * query_size

                                                                                                                                                                      TYPE: Optional[FloatTensor] DEFAULT: None

                                                                                                                                                                      mask

                                                                                                                                                                      The content key embedding to use in the attention computation. If None, defaults to the content_queries Shape: either - n_samples * n_keys - n_samples * n_queries * n_keys - n_samples * n_queries * n_keys * n_heads

                                                                                                                                                                      TYPE: Optional[BoolTensor] DEFAULT: None

                                                                                                                                                                      relative_positions

                                                                                                                                                                      The relative position of keys relative to queries If None, positional attention terms won't be computed. Shape: n_samples * n_queries * n_keys * n_coordinates

                                                                                                                                                                      TYPE: Optional[LongTensor] DEFAULT: None

                                                                                                                                                                      no_position_mask

                                                                                                                                                                      Key / query pairs for which the position attention terms should be disabled. Shape: n_samples * n_queries * n_keys

                                                                                                                                                                      TYPE: Optional[BoolTensor] DEFAULT: None

                                                                                                                                                                      base_attn

                                                                                                                                                                      Attention logits to add to the computed attention logits Shape: n_samples * n_queries * n_keys * n_heads

                                                                                                                                                                      TYPE: Optional[FloatTensor] DEFAULT: None

                                                                                                                                                                      RETURNS DESCRIPTION Union[Tuple[FloatTensor, FloatTensor], FloatTensor]
                                                                                                                                                                      • the output contextualized embeddings (only if content_values is not None and the do_pooling attribute is set to True) Shape: n_sample * n_keys * size
                                                                                                                                                                      • the attention logits Shape: n_samples * n_queries * n_keys * (n_heads + n_additional_heads)
                                                                                                                                                                      "},{"location":"reference/edspdf/layers/sinusoidal_embedding/","title":"edspdf.layers.sinusoidal_embedding","text":""},{"location":"reference/edspdf/layers/sinusoidal_embedding/#edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding","title":"SinusoidalEmbedding","text":"

                                                                                                                                                                      Bases: Module

                                                                                                                                                                      A position embedding lookup table that stores embeddings for a fixed number of positions. The value of each of the embedding_dim channels of the generated embedding is generated according to a trigonometric function (sin for even channels, cos for odd channels). The frequency of the signal in each pair of channels varies according to the temperature parameter.

                                                                                                                                                                      Any input position above the maximum value num_embeddings will be capped to num_embeddings - 1

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/sinusoidal_embedding/#edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION num_embeddings

                                                                                                                                                                      The maximum number of position embeddings stored in this table

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      embedding_dim

                                                                                                                                                                      The embedding size

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      temperature

                                                                                                                                                                      The temperature controls the range of frequencies used by each channel of the embedding

                                                                                                                                                                      TYPE: float DEFAULT: 10000.0

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/sinusoidal_embedding/#edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding.forward","title":"forward","text":"

                                                                                                                                                                      Forward pass of the SinusoidalEmbedding module

                                                                                                                                                                      PARAMETER DESCRIPTION indices

                                                                                                                                                                      Shape: any

                                                                                                                                                                      TYPE: LongTensor

                                                                                                                                                                      RETURNS DESCRIPTION FloatTensor

                                                                                                                                                                      Shape: (*input_shape, embedding_dim)

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/vocabulary/","title":"edspdf.layers.vocabulary","text":""},{"location":"reference/edspdf/layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary","title":"Vocabulary","text":"

                                                                                                                                                                      Bases: Module, Generic[T]

                                                                                                                                                                      Vocabulary layer. This is not meant to be used as a torch.nn.Module but subclassing torch.nn.Module makes the instances appear when printing a model, which is nice.

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION items

                                                                                                                                                                      Initial vocabulary elements if any. Specific elements such as padding and unk can be set here to enforce their index in the vocabulary.

                                                                                                                                                                      TYPE: Sequence[T] DEFAULT: None

                                                                                                                                                                      default

                                                                                                                                                                      Default index to use for out of vocabulary elements Defaults to -100

                                                                                                                                                                      TYPE: int DEFAULT: -100

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary.initialization","title":"initialization","text":"

                                                                                                                                                                      Enters the initialization mode. Out of vocabulary elements will be assigned an index.

                                                                                                                                                                      "},{"location":"reference/edspdf/layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary.encode","title":"encode","text":"

                                                                                                                                                                      Converts an element into its vocabulary index. If the layer is in its initialization mode (with vocab.initialization(): ...), and the element is out of vocabulary, a new index will be created and returned. Otherwise, any oov element will be encoded with the default index.

                                                                                                                                                                      PARAMETER DESCRIPTION item

                                                                                                                                                                      RETURNS DESCRIPTION int"},{"location":"reference/edspdf/layers/vocabulary/#edspdf.layers.vocabulary.Vocabulary.decode","title":"decode","text":"

                                                                                                                                                                      Converts an index into its original value

                                                                                                                                                                      PARAMETER DESCRIPTION idx

                                                                                                                                                                      RETURNS DESCRIPTION InputT"},{"location":"reference/edspdf/pipes/","title":"edspdf.pipes","text":""},{"location":"reference/edspdf/pipes/aggregators/","title":"edspdf.pipes.aggregators","text":""},{"location":"reference/edspdf/pipes/aggregators/simple/","title":"edspdf.pipes.aggregators.simple","text":""},{"location":"reference/edspdf/pipes/aggregators/simple/#edspdf.pipes.aggregators.simple.SimpleAggregator","title":"SimpleAggregator","text":"

                                                                                                                                                                      Aggregator that returns texts and styles. It groups all text boxes with the same label under the aggregated_text, and additionally aggregates the styles of the text boxes.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/aggregators/simple/#edspdf.pipes.aggregators.simple.SimpleAggregator--examples","title":"Examples","text":"

                                                                                                                                                                      Create a pipeline

                                                                                                                                                                      API-basedConfiguration-based
                                                                                                                                                                      pipeline = ...\npipeline.add_pipe(\n    \"simple-aggregator\",\n    name=\"aggregator\",\n    config={\n        \"new_line_threshold\": 0.2,\n        \"new_paragraph_threshold\": 1.5,\n        \"label_map\": {\n            \"body\": \"text\",\n            \"table\": \"text\",\n        },\n    },\n)\n
                                                                                                                                                                      ...\n\n[components.aggregator]\n@factory = \"simple-aggregator\"\nnew_line_threshold = 0.2\nnew_paragraph_threshold = 1.5\nlabel_map = { body = \"text\", table = \"text\" }\n\n...\n

                                                                                                                                                                      and run it on a document:

                                                                                                                                                                      doc = pipeline(doc)\nprint(doc.aggregated_texts)\n# {\n#     \"text\": \"This is the body of the document, followed by a table | A | B |\"\n# }\n
                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/aggregators/simple/#edspdf.pipes.aggregators.simple.SimpleAggregator--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                      The pipeline object

                                                                                                                                                                      TYPE: Pipeline DEFAULT: None

                                                                                                                                                                      name

                                                                                                                                                                      The name of the component

                                                                                                                                                                      TYPE: str DEFAULT: 'simple-aggregator'

                                                                                                                                                                      sort

                                                                                                                                                                      Whether to sort text boxes inside each label group by (page, y, x) position before merging them.

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      new_line_threshold

                                                                                                                                                                      Minimum ratio of the distance between two lines to the median height of lines to consider them as being on separate lines

                                                                                                                                                                      TYPE: float DEFAULT: 0.2

                                                                                                                                                                      new_paragraph_threshold

                                                                                                                                                                      Minimum ratio of the distance between two lines to the median height of lines to consider them as being on separate paragraphs and thus add a newline character between them.

                                                                                                                                                                      TYPE: float DEFAULT: 1.5

                                                                                                                                                                      label_map

                                                                                                                                                                      A dictionary mapping labels to new labels. This is useful to group labels together, for instance, to output both \"body\" and \"table\" as \"text\".

                                                                                                                                                                      TYPE: Dict DEFAULT: {}

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/","title":"edspdf.pipes.classifiers","text":""},{"location":"reference/edspdf/pipes/classifiers/dummy/","title":"edspdf.pipes.classifiers.dummy","text":""},{"location":"reference/edspdf/pipes/classifiers/dummy/#edspdf.pipes.classifiers.dummy.DummyClassifier","title":"DummyClassifier","text":"

                                                                                                                                                                      Dummy classifier, for chaos purposes. Assigns the same configured label to every line.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/dummy/#edspdf.pipes.classifiers.dummy.DummyClassifier--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                      The pipeline object.

                                                                                                                                                                      TYPE: Pipeline DEFAULT: None

                                                                                                                                                                      name

                                                                                                                                                                      The name of the component.

                                                                                                                                                                      TYPE: str DEFAULT: 'dummy-classifier'

                                                                                                                                                                      label

                                                                                                                                                                      The label to assign to each line.

                                                                                                                                                                      TYPE: str

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/mask/","title":"edspdf.pipes.classifiers.mask","text":""},{"location":"reference/edspdf/pipes/classifiers/mask/#edspdf.pipes.classifiers.mask.MaskClassifier","title":"MaskClassifier","text":"

                                                                                                                                                                      Simple mask classifier, that labels every box inside one of the masks with its label.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/mask/#edspdf.pipes.classifiers.mask.simple_mask_classifier_factory","title":"simple_mask_classifier_factory","text":"

                                                                                                                                                                      The simplest form of mask classification. You define the mask, everything else is tagged as pollution.

                                                                                                                                                                      PARAMETER DESCRIPTION pipeline

                                                                                                                                                                      The pipeline object

                                                                                                                                                                      TYPE: Pipeline DEFAULT: None

                                                                                                                                                                      name

                                                                                                                                                                      The name of the component

                                                                                                                                                                      TYPE: str DEFAULT: 'mask-classifier'

                                                                                                                                                                      x0

                                                                                                                                                                      The x0 coordinate of the mask

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      y0

                                                                                                                                                                      The y0 coordinate of the mask

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      x1

                                                                                                                                                                      The x1 coordinate of the mask

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      y1

                                                                                                                                                                      The y1 coordinate of the mask

                                                                                                                                                                      TYPE: float

                                                                                                                                                                      threshold

                                                                                                                                                                      The threshold for the alignment

                                                                                                                                                                      TYPE: float DEFAULT: 1.0

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/mask/#edspdf.pipes.classifiers.mask.simple_mask_classifier_factory--examples","title":"Examples","text":"API-basedConfiguration-based
                                                                                                                                                                      pipeline.add_pipe(\n    \"mask-classifier\",\n    name=\"classifier\",\n    config={\n        \"threshold\": 0.9,\n        \"x0\": 0.1,\n        \"y0\": 0.1,\n        \"x1\": 0.9,\n        \"y1\": 0.9,\n    },\n)\n
                                                                                                                                                                      [components.classifier]\n@classifiers = \"mask-classifier\"\nx0 = 0.1\ny0 = 0.1\nx1 = 0.9\ny1 = 0.9\nthreshold = 0.9\n
                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/mask/#edspdf.pipes.classifiers.mask.mask_classifier_factory","title":"mask_classifier_factory","text":"

                                                                                                                                                                      A generalisation, wherein the user defines a number of regions.

                                                                                                                                                                      The following configuration produces exactly the same classifier as the mask-classifier example above.

                                                                                                                                                                      Any block that is not part of a mask is tagged as pollution.

                                                                                                                                                                      PARAMETER DESCRIPTION pipeline

                                                                                                                                                                      The pipeline object

                                                                                                                                                                      TYPE: Pipeline DEFAULT: None

                                                                                                                                                                      name

                                                                                                                                                                      TYPE: str DEFAULT: 'multi-mask-classifier'

                                                                                                                                                                      threshold

                                                                                                                                                                      The threshold for the alignment

                                                                                                                                                                      TYPE: float DEFAULT: 1.0

                                                                                                                                                                      masks

                                                                                                                                                                      The masks

                                                                                                                                                                      TYPE: Box DEFAULT: {}

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/mask/#edspdf.pipes.classifiers.mask.mask_classifier_factory--examples","title":"Examples","text":"API-basedConfiguration-based
                                                                                                                                                                      pipeline.add_pipe(\n    \"multi-mask-classifier\",\n    name=\"classifier\",\n    config={\n        \"threshold\": 0.9,\n        \"mymask\": {\"x0\": 0.1, \"y0\": 0.1, \"x1\": 0.9, \"y1\": 0.3, \"label\": \"body\"},\n    },\n)\n
                                                                                                                                                                      [components.classifier]\n@factory = \"multi-mask-classifier\"\nthreshold = 0.9\n\n[components.classifier.mymask]\nlabel = \"body\"\nx0 = 0.1\ny0 = 0.1\nx1 = 0.9\ny1 = 0.9\n

                                                                                                                                                                      The following configuration defines a header region.

                                                                                                                                                                      API-basedConfiguration-based
                                                                                                                                                                      pipeline.add_pipe(\n    \"multi-mask-classifier\",\n    name=\"classifier\",\n    config={\n        \"threshold\": 0.9,\n        \"header\": {\"x0\": 0.1, \"y0\": 0.1, \"x1\": 0.9, \"y1\": 0.3, \"label\": \"header\"},\n        \"body\": {\"x0\": 0.1, \"y0\": 0.3, \"x1\": 0.9, \"y1\": 0.9, \"label\": \"body\"},\n    },\n)\n
                                                                                                                                                                      [components.classifier]\n@factory = \"multi-mask-classifier\"\nthreshold = 0.9\n\n[components.classifier.header]\nlabel = \"header\"\nx0 = 0.1\ny0 = 0.1\nx1 = 0.9\ny1 = 0.3\n\n[components.classifier.body]\nlabel = \"body\"\nx0 = 0.1\ny0 = 0.3\nx1 = 0.9\ny1 = 0.9\n
                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/random/","title":"edspdf.pipes.classifiers.random","text":""},{"location":"reference/edspdf/pipes/classifiers/random/#edspdf.pipes.classifiers.random.RandomClassifier","title":"RandomClassifier","text":"

                                                                                                                                                                      Random classifier, for chaos purposes. Classifies each box to a random element.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/random/#edspdf.pipes.classifiers.random.RandomClassifier--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                      The pipeline object.

                                                                                                                                                                      TYPE: Pipeline

                                                                                                                                                                      name

                                                                                                                                                                      The name of the component.

                                                                                                                                                                      TYPE: str DEFAULT: 'random-classifier'

                                                                                                                                                                      labels

                                                                                                                                                                      The labels to assign to each line. If a list is passed, each label is assigned with equal probability. If a dict is passed, the keys are the labels and the values are the probabilities.

                                                                                                                                                                      TYPE: Union[List[str], Dict[str, float]]

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/trainable/","title":"edspdf.pipes.classifiers.trainable","text":""},{"location":"reference/edspdf/pipes/classifiers/trainable/#edspdf.pipes.classifiers.trainable.TrainableClassifier","title":"TrainableClassifier","text":"

                                                                                                                                                                      Bases: TrainablePipe[Dict[str, Any]]

                                                                                                                                                                      This component predicts a label for each box over the whole document using machine learning.

                                                                                                                                                                      Note

                                                                                                                                                                      You must train your model to use this classifier. See Model training for more information.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/trainable/#edspdf.pipes.classifiers.trainable.TrainableClassifier--examples","title":"Examples","text":"

                                                                                                                                                                      The classifier is composed of the following blocks:

                                                                                                                                                                      • a configurable box embedding layer
                                                                                                                                                                      • a linear classification layer

                                                                                                                                                                      In this example, we use a box-embedding layer to generate the embeddings of the boxes. It is composed of a text encoder that embeds the text features of the boxes and a layout encoder that embeds the layout features of the boxes. These two embeddings are summed and passed through an optional contextualizer, here a box-transformer.

                                                                                                                                                                      API-basedConfiguration-based
                                                                                                                                                                      pipeline.add_pipe(\n    \"trainable-classifier\",\n    name=\"classifier\",\n    config={\n        # simple embedding computed by pooling embeddings of words in each box\n        \"embedding\": {\n            \"@factory\": \"sub-box-cnn-pooler\",\n            \"out_channels\": 64,\n            \"kernel_sizes\": (3, 4, 5),\n            \"embedding\": {\n                \"@factory\": \"simple-text-embedding\",\n                \"size\": 72,\n            },\n        },\n        \"labels\": [\"body\", \"pollution\"],\n    },\n)\n
                                                                                                                                                                      [components.classifier]\n@factory = \"trainable-classifier\"\nlabels = [\"body\", \"pollution\"]\n\n[components.classifier.embedding]\n@factory = \"sub-box-cnn-pooler\"\nout_channels = 64\nkernel_sizes = (3, 4, 5)\n\n[components.classifier.embedding.embedding]\n@factory = \"simple-text-embedding\"\nsize = 72\n
                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/classifiers/trainable/#edspdf.pipes.classifiers.trainable.TrainableClassifier--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION labels

                                                                                                                                                                      Initial labels of the classifier (will be completed during initialization)

                                                                                                                                                                      TYPE: Sequence[str] DEFAULT: ('pollution')

                                                                                                                                                                      embedding

                                                                                                                                                                      Embedding module to encode the PDF boxes

                                                                                                                                                                      TYPE: TrainablePipe[EmbeddingOutput]

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/","title":"edspdf.pipes.embeddings","text":""},{"location":"reference/edspdf/pipes/embeddings/box_layout_embedding/","title":"edspdf.pipes.embeddings.box_layout_embedding","text":""},{"location":"reference/edspdf/pipes/embeddings/box_layout_embedding/#edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding","title":"BoxLayoutEmbedding","text":"

                                                                                                                                                                      Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                                                                      This component encodes the geometrical features of a box, as extracted by the BoxLayoutPreprocessor module, into an embedding. For position modes, use:

                                                                                                                                                                      • \"sin\" to embed positions with a fixed SinusoidalEmbedding
                                                                                                                                                                      • \"learned\" to embed positions using a learned standard pytorch embedding layer

                                                                                                                                                                      Each produces embedding is the concatenation of the box width, height and the top, left, bottom and right coordinates, each embedded depending on the *_mode param.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/box_layout_embedding/#edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION size

                                                                                                                                                                      Size of the output box embedding

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      n_positions

                                                                                                                                                                      Number of position embeddings stored in the PositionEmbedding module

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      x_mode

                                                                                                                                                                      Position embedding mode of the x coordinates

                                                                                                                                                                      TYPE: Literal['sin', 'learned'] DEFAULT: 'sin'

                                                                                                                                                                      y_mode

                                                                                                                                                                      Position embedding mode of the y coordinates

                                                                                                                                                                      TYPE: Literal['sin', 'learned'] DEFAULT: 'sin'

                                                                                                                                                                      w_mode

                                                                                                                                                                      Position embedding mode of the width features

                                                                                                                                                                      TYPE: Literal['sin', 'learned'] DEFAULT: 'sin'

                                                                                                                                                                      h_mode

                                                                                                                                                                      Position embedding mode of the height features

                                                                                                                                                                      TYPE: Literal['sin', 'learned'] DEFAULT: 'sin'

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/box_layout_preprocessor/","title":"edspdf.pipes.embeddings.box_layout_preprocessor","text":""},{"location":"reference/edspdf/pipes/embeddings/box_layout_preprocessor/#edspdf.pipes.embeddings.box_layout_preprocessor.BoxLayoutPreprocessor","title":"BoxLayoutPreprocessor","text":"

                                                                                                                                                                      Bases: TrainablePipe[BoxLayoutBatch]

                                                                                                                                                                      The box preprocessor is singleton since its is not configurable. The following features of each box of an input PDFDoc document are encoded as 1D tensors:

                                                                                                                                                                      • boxes_page: page index of the box
                                                                                                                                                                      • boxes_first_page: is the box on the first page
                                                                                                                                                                      • boxes_last_page: is the box on the last page
                                                                                                                                                                      • boxes_xmin: left position of the box
                                                                                                                                                                      • boxes_ymin: bottom position of the box
                                                                                                                                                                      • boxes_xmax: right position of the box
                                                                                                                                                                      • boxes_ymax: top position of the box
                                                                                                                                                                      • boxes_w: width position of the box
                                                                                                                                                                      • boxes_h: height position of the box

                                                                                                                                                                      The preprocessor also returns an additional tensor:

                                                                                                                                                                      • page_boxes_id: box indices per page to index the above 1D tensors (LongTensor: n_pages * n_boxes)
                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/box_transformer/","title":"edspdf.pipes.embeddings.box_transformer","text":""},{"location":"reference/edspdf/pipes/embeddings/box_transformer/#edspdf.pipes.embeddings.box_transformer.BoxTransformer","title":"BoxTransformer","text":"

                                                                                                                                                                      Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                                                                      BoxTransformer using BoxTransformerModule under the hood.

                                                                                                                                                                      Note

                                                                                                                                                                      This module is a TrainablePipe and can be used in a Pipeline, while BoxTransformerModule is a standard PyTorch module, which does not take care of the preprocessing, collating, etc. of the input documents.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/box_transformer/#edspdf.pipes.embeddings.box_transformer.BoxTransformer--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                      Pipeline instance

                                                                                                                                                                      TYPE: Pipeline DEFAULT: None

                                                                                                                                                                      name

                                                                                                                                                                      Name of the component

                                                                                                                                                                      TYPE: str DEFAULT: 'box-transformer'

                                                                                                                                                                      num_heads

                                                                                                                                                                      Number of attention heads in the attention layers

                                                                                                                                                                      TYPE: int DEFAULT: 2

                                                                                                                                                                      n_relative_positions

                                                                                                                                                                      Maximum range of embeddable relative positions between boxes (further distances are capped to \u00b1n_relative_positions // 2)

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      dropout_p

                                                                                                                                                                      Dropout probability both for the attention layers and embedding projections

                                                                                                                                                                      TYPE: float DEFAULT: 0.0

                                                                                                                                                                      head_size

                                                                                                                                                                      Head sizes of the attention layers

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      activation

                                                                                                                                                                      Activation function used in the linear->activation->linear transformations

                                                                                                                                                                      TYPE: ActivationFunction DEFAULT: 'gelu'

                                                                                                                                                                      init_resweight

                                                                                                                                                                      Initial weight of the residual gates. At 0, the layer acts (initially) as an identity function, and at 1 as a standard Transformer layer. Initializing with a value close to 0 can help the training converge.

                                                                                                                                                                      TYPE: float DEFAULT: 0.0

                                                                                                                                                                      attention_mode

                                                                                                                                                                      Mode of relative position infused attention layer. See the relative attention documentation for more information.

                                                                                                                                                                      TYPE: Sequence[Literal['c2c', 'c2p', 'p2c']] DEFAULT: ('c2c', 'c2p', 'p2c')

                                                                                                                                                                      n_layers

                                                                                                                                                                      Number of layers in the Transformer

                                                                                                                                                                      TYPE: int DEFAULT: 2

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/embedding_combiner/","title":"edspdf.pipes.embeddings.embedding_combiner","text":""},{"location":"reference/edspdf/pipes/embeddings/embedding_combiner/#edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner","title":"EmbeddingCombiner","text":"

                                                                                                                                                                      Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                                                                      Encodes boxes using a combination of multiple encoders

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/embedding_combiner/#edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                      The pipeline object

                                                                                                                                                                      TYPE: Pipeline DEFAULT: None

                                                                                                                                                                      name

                                                                                                                                                                      The name of the pipe

                                                                                                                                                                      TYPE: str DEFAULT: 'embedding-combiner'

                                                                                                                                                                      mode

                                                                                                                                                                      The mode to use to combine the encoders:

                                                                                                                                                                      • sum: Sum the outputs of the encoders
                                                                                                                                                                      • cat: Concatenate the outputs of the encoders

                                                                                                                                                                      TYPE: Literal['sum', 'cat'] DEFAULT: 'sum'

                                                                                                                                                                      dropout_p

                                                                                                                                                                      Dropout probability used on the output of the box and textual encoders

                                                                                                                                                                      TYPE: float DEFAULT: 0.0

                                                                                                                                                                      encoders

                                                                                                                                                                      The encoders to use. The keys are the names of the encoders and the values are the encoders themselves.

                                                                                                                                                                      TYPE: TrainablePipe[EmbeddingOutput] DEFAULT: {}

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/huggingface_embedding/","title":"edspdf.pipes.embeddings.huggingface_embedding","text":""},{"location":"reference/edspdf/pipes/embeddings/huggingface_embedding/#edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding","title":"HuggingfaceEmbedding","text":"

                                                                                                                                                                      Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                                                                      The HuggingfaceEmbedding component is a wrapper around the Huggingface multi-modal models. Such pre-trained models should offer better results than a model trained from scratch. Compared to using the raw Huggingface model, we offer a simple mechanism to split long documents into strided windows before feeding them to the model.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/huggingface_embedding/#edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding--windowing","title":"Windowing","text":"

                                                                                                                                                                      The HuggingfaceEmbedding component splits long documents into smaller windows before feeding them to the model. This is done to avoid hitting the maximum number of tokens that can be processed by the model on a single device. The window size and stride can be configured using the window and stride parameters. The default values are 510 and 255 respectively, which means that the model will process windows of 510 tokens, each separated by 255 tokens. Whenever a token appears in multiple windows, the embedding of the \"most contextualized\" occurrence is used, i.e. the occurrence that is the closest to the center of its window.

                                                                                                                                                                      Here is an overview of how this works in a classifier model:

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/huggingface_embedding/#edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding--examples","title":"Examples","text":"

                                                                                                                                                                      Here is an example of how to define a pipeline with the HuggingfaceEmbedding component:

                                                                                                                                                                      from edspdf import Pipeline\n\nmodel = Pipeline()\nmodel.add_pipe(\n    \"pdfminer-extractor\",\n    name=\"extractor\",\n    config={\n        \"render_pages\": True,\n    },\n)\nmodel.add_pipe(\n    \"huggingface-embedding\",\n    name=\"embedding\",\n    config={\n        \"model\": \"microsoft/layoutlmv3-base\",\n        \"use_image\": False,\n        \"window\": 128,\n        \"stride\": 64,\n        \"line_pooling\": \"mean\",\n    },\n)\nmodel.add_pipe(\n    \"trainable-classifier\",\n    name=\"classifier\",\n    config={\n        \"embedding\": model.get_pipe(\"embedding\"),\n        \"labels\": [],\n    },\n)\n

                                                                                                                                                                      This model can then be trained following the training recipe.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/huggingface_embedding/#edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                      The pipeline instance

                                                                                                                                                                      TYPE: Pipeline DEFAULT: None

                                                                                                                                                                      name

                                                                                                                                                                      The component name

                                                                                                                                                                      TYPE: str DEFAULT: 'huggingface-embedding'

                                                                                                                                                                      model

                                                                                                                                                                      The Huggingface model name or path

                                                                                                                                                                      TYPE: str DEFAULT: None

                                                                                                                                                                      use_image

                                                                                                                                                                      Whether to use the image or not in the model

                                                                                                                                                                      TYPE: bool DEFAULT: True

                                                                                                                                                                      window

                                                                                                                                                                      The window size to use when splitting long documents into smaller windows before feeding them to the Transformer model (default: 510 = 512 - 2)

                                                                                                                                                                      TYPE: int DEFAULT: 510

                                                                                                                                                                      stride

                                                                                                                                                                      The stride (distance between windows) to use when splitting long documents into smaller windows: (default: 510 / 2 = 255)

                                                                                                                                                                      TYPE: int DEFAULT: 255

                                                                                                                                                                      line_pooling

                                                                                                                                                                      The pooling strategy to use when combining the embeddings of the tokens in a line into a single line embedding

                                                                                                                                                                      TYPE: Literal['mean', 'max', 'sum'] DEFAULT: 'mean'

                                                                                                                                                                      max_tokens_per_device

                                                                                                                                                                      The maximum number of tokens that can be processed by the model on a single device. This does not affect the results but can be used to reduce the memory usage of the model, at the cost of a longer processing time.

                                                                                                                                                                      TYPE: int DEFAULT: 128 * 128

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/simple_text_embedding/","title":"edspdf.pipes.embeddings.simple_text_embedding","text":""},{"location":"reference/edspdf/pipes/embeddings/simple_text_embedding/#edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding","title":"SimpleTextEmbedding","text":"

                                                                                                                                                                      Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                                                                      A module that embeds the textual features of the blocks

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/simple_text_embedding/#edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION size

                                                                                                                                                                      Size of the output box embedding

                                                                                                                                                                      TYPE: int

                                                                                                                                                                      pipeline

                                                                                                                                                                      The pipeline object

                                                                                                                                                                      TYPE: Pipeline DEFAULT: None

                                                                                                                                                                      name

                                                                                                                                                                      Name of the component

                                                                                                                                                                      TYPE: str DEFAULT: 'simple-text-embedding'

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/simple_text_embedding/#edspdf.pipes.embeddings.simple_text_embedding.word_shape","title":"word_shape","text":"

                                                                                                                                                                      Converts a word into its shape following the algorithm used in the spaCy library.

                                                                                                                                                                      https://github.com/explosion/spaCy/blob/b69d249a/spacy/lang/lex_attrs.py#L118

                                                                                                                                                                      PARAMETER DESCRIPTION text

                                                                                                                                                                      TYPE: str

                                                                                                                                                                      RETURNS DESCRIPTION str The word shape"},{"location":"reference/edspdf/pipes/embeddings/sub_box_cnn_pooler/","title":"edspdf.pipes.embeddings.sub_box_cnn_pooler","text":""},{"location":"reference/edspdf/pipes/embeddings/sub_box_cnn_pooler/#edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler","title":"SubBoxCNNPooler","text":"

                                                                                                                                                                      Bases: TrainablePipe[EmbeddingOutput]

                                                                                                                                                                      One-dimensional CNN encoding multi-kernel layer. Input embeddings are convolved using linear kernels, each parametrized with a (window) size of kernel_sizes[kernel_i]. The outputs of the kernels are concatenated together, max-pooled and finally projected to a size of output_size.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/embeddings/sub_box_cnn_pooler/#edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION pipeline

                                                                                                                                                                      Pipeline instance

                                                                                                                                                                      TYPE: Pipeline DEFAULT: None

                                                                                                                                                                      name

                                                                                                                                                                      Name of the component

                                                                                                                                                                      TYPE: str DEFAULT: 'sub-box-cnn-pooler'

                                                                                                                                                                      output_size

                                                                                                                                                                      Size of the output embeddings. Defaults to the input_size.

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      out_channels

                                                                                                                                                                      Number of channels

                                                                                                                                                                      TYPE: Optional[int] DEFAULT: None

                                                                                                                                                                      kernel_sizes

                                                                                                                                                                      Window size of each kernel

                                                                                                                                                                      TYPE: Sequence[int] DEFAULT: (3, 4, 5)

                                                                                                                                                                      activation

                                                                                                                                                                      Activation function to use

                                                                                                                                                                      TYPE: ActivationFunction DEFAULT: 'relu'

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/extractors/","title":"edspdf.pipes.extractors","text":""},{"location":"reference/edspdf/pipes/extractors/pdfminer/","title":"edspdf.pipes.extractors.pdfminer","text":""},{"location":"reference/edspdf/pipes/extractors/pdfminer/#edspdf.pipes.extractors.pdfminer.PdfMinerExtractor","title":"PdfMinerExtractor","text":"

                                                                                                                                                                      We provide a PDF line extractor built on top of PdfMiner.

                                                                                                                                                                      This is the most portable extractor, since it is pure-python and can therefore be run on any platform. Be sure to have a look at their documentation, especially the part providing a bird's eye view of the PDF extraction process.

                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/extractors/pdfminer/#edspdf.pipes.extractors.pdfminer.PdfMinerExtractor--examples","title":"Examples","text":"API-basedConfiguration-based
                                                                                                                                                                      pipeline.add_pipe(\n    \"pdfminer-extractor\",\n    config=dict(\n        extract_style=False,\n    ),\n)\n
                                                                                                                                                                      [components.extractor]\n@factory = \"pdfminer-extractor\"\nextract_style = false\n

                                                                                                                                                                      And use the pipeline on a PDF document:

                                                                                                                                                                      from pathlib import Path\n\n# Apply on a new document\npipeline(Path(\"path/to/your/pdf/document\").read_bytes())\n
                                                                                                                                                                      "},{"location":"reference/edspdf/pipes/extractors/pdfminer/#edspdf.pipes.extractors.pdfminer.PdfMinerExtractor--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION line_overlap

                                                                                                                                                                      See PDFMiner documentation

                                                                                                                                                                      TYPE: float DEFAULT: 0.5

                                                                                                                                                                      char_margin

                                                                                                                                                                      See PDFMiner documentation

                                                                                                                                                                      TYPE: float DEFAULT: 2.05

                                                                                                                                                                      line_margin

                                                                                                                                                                      See PDFMiner documentation

                                                                                                                                                                      TYPE: float DEFAULT: 0.5

                                                                                                                                                                      word_margin

                                                                                                                                                                      See PDFMiner documentation

                                                                                                                                                                      TYPE: float DEFAULT: 0.1

                                                                                                                                                                      boxes_flow

                                                                                                                                                                      See PDFMiner documentation

                                                                                                                                                                      TYPE: Optional[float] DEFAULT: 0.5

                                                                                                                                                                      detect_vertical

                                                                                                                                                                      See PDFMiner documentation

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      all_texts

                                                                                                                                                                      See PDFMiner documentation

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      extract_style

                                                                                                                                                                      Whether to extract style (font, size, ...) information for each line of the document. Default: False

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      render_pages

                                                                                                                                                                      Whether to extract the rendered page as a numpy array in the page.image attribute (defaults to False)

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      render_dpi

                                                                                                                                                                      DPI to use when rendering the page (defaults to 200)

                                                                                                                                                                      TYPE: int DEFAULT: 200

                                                                                                                                                                      raise_on_error

                                                                                                                                                                      Whether to raise an error if the PDF cannot be parsed. Default: False

                                                                                                                                                                      TYPE: bool DEFAULT: False

                                                                                                                                                                      "},{"location":"reference/edspdf/utils/","title":"edspdf.utils","text":""},{"location":"reference/edspdf/utils/alignment/","title":"edspdf.utils.alignment","text":""},{"location":"reference/edspdf/utils/alignment/#edspdf.utils.alignment.align_box_labels","title":"align_box_labels","text":"

                                                                                                                                                                      Align lines with possibly overlapping (and non-exhaustive) labels.

                                                                                                                                                                      Possible matches are sorted by covered area. Lines with no overlap at all are assigned the pollution_label.

                                                                                                                                                                      PARAMETER DESCRIPTION src_boxes

                                                                                                                                                                      The labelled boxes that will be used to determine the label of the dst_boxes

                                                                                                                                                                      TYPE: Sequence[Box]

                                                                                                                                                                      dst_boxes

                                                                                                                                                                      The non-labelled boxes that will be assigned a label

                                                                                                                                                                      TYPE: Sequence[T]

                                                                                                                                                                      threshold

                                                                                                                                                                      Threshold to use for discounting a label. Used if the labels DataFrame does not provide a threshold column, or to fill NaN values thereof.

                                                                                                                                                                      TYPE: float DEFAULT: 1

                                                                                                                                                                      pollution_label

                                                                                                                                                                      The label to use for boxes that are not covered by any of the source boxes

                                                                                                                                                                      TYPE: Any DEFAULT: None

                                                                                                                                                                      RETURNS DESCRIPTION List[Box]

                                                                                                                                                                      A copy of the boxes, with the labels mapped from the source boxes

                                                                                                                                                                      "},{"location":"reference/edspdf/utils/collections/","title":"edspdf.utils.collections","text":""},{"location":"reference/edspdf/utils/collections/#edspdf.utils.collections.multi_tee","title":"multi_tee","text":"

                                                                                                                                                                      Makes copies of an iterable such that every iteration over it starts from 0. If the iterable is a sequence (list, tuple), just returns it since every iter() over the object restart from the beginning

                                                                                                                                                                      "},{"location":"reference/edspdf/utils/collections/#edspdf.utils.collections.FrozenDict","title":"FrozenDict","text":"

                                                                                                                                                                      Bases: dict

                                                                                                                                                                      Copied from spacy.util.SimpleFrozenDict to ensure compatibility.

                                                                                                                                                                      Initialize the frozen dict. Can be initialized with pre-defined values.

                                                                                                                                                                      error (str): The error message when user tries to assign to dict.

                                                                                                                                                                      "},{"location":"reference/edspdf/utils/collections/#edspdf.utils.collections.FrozenList","title":"FrozenList","text":"

                                                                                                                                                                      Bases: list

                                                                                                                                                                      Copied from spacy.util.SimpleFrozenList to ensure compatibility.

                                                                                                                                                                      Initialize the frozen list.

                                                                                                                                                                      error (str): The error message when user tries to mutate the list.

                                                                                                                                                                      "},{"location":"reference/edspdf/utils/optimization/","title":"edspdf.utils.optimization","text":""},{"location":"reference/edspdf/utils/package/","title":"edspdf.utils.package","text":""},{"location":"reference/edspdf/utils/package/#edspdf.utils.package.PoetryPackager","title":"PoetryPackager","text":""},{"location":"reference/edspdf/utils/package/#edspdf.utils.package.PoetryPackager.ensure_pyproject","title":"ensure_pyproject","text":"

                                                                                                                                                                      Generates a Poetry based pyproject.toml

                                                                                                                                                                      "},{"location":"reference/edspdf/utils/random/","title":"edspdf.utils.random","text":""},{"location":"reference/edspdf/utils/random/#edspdf.utils.random.set_seed","title":"set_seed","text":"

                                                                                                                                                                      Set seed values for random generators. If used as a context, restore the random state used before entering the context.

                                                                                                                                                                      "},{"location":"reference/edspdf/utils/random/#edspdf.utils.random.set_seed--parameters","title":"Parameters","text":"PARAMETER DESCRIPTION seed

                                                                                                                                                                      Value used as a seed.

                                                                                                                                                                      cuda

                                                                                                                                                                      Saves the cuda random states too

                                                                                                                                                                      DEFAULT: is_available()

                                                                                                                                                                      "},{"location":"reference/edspdf/utils/random/#edspdf.utils.random.get_random_generator_state","title":"get_random_generator_state","text":"

                                                                                                                                                                      Get the random generator states of the torch, numpy and random modules.

                                                                                                                                                                      PARAMETER DESCRIPTION cuda

                                                                                                                                                                      Saves the cuda random states too

                                                                                                                                                                      DEFAULT: is_available()

                                                                                                                                                                      RETURNS DESCRIPTION RandomGeneratorState"},{"location":"reference/edspdf/utils/random/#edspdf.utils.random.set_random_generator_state","title":"set_random_generator_state","text":"

                                                                                                                                                                      Set the random generator states of the torch, numpy and random modules.

                                                                                                                                                                      PARAMETER DESCRIPTION state

                                                                                                                                                                      "},{"location":"reference/edspdf/utils/torch/","title":"edspdf.utils.torch","text":""},{"location":"reference/edspdf/utils/torch/#edspdf.utils.torch.compute_pdf_relative_positions","title":"compute_pdf_relative_positions","text":"

                                                                                                                                                                      Compute relative positions between boxes. Input boxes must be split between pages with the shape n_pages * n_boxes

                                                                                                                                                                      PARAMETER DESCRIPTION x0

                                                                                                                                                                      y0

                                                                                                                                                                      x1

                                                                                                                                                                      y1

                                                                                                                                                                      width

                                                                                                                                                                      height

                                                                                                                                                                      n_relative_positions

                                                                                                                                                                      Maximum range of embeddable relative positions between boxes (further distances will be capped to \u00b1n_relative_positions // 2)

                                                                                                                                                                      RETURNS DESCRIPTION LongTensor

                                                                                                                                                                      Shape: n_pages * n_boxes * n_boxes * 2

                                                                                                                                                                      "},{"location":"reference/edspdf/visualization/","title":"edspdf.visualization","text":""},{"location":"reference/edspdf/visualization/annotations/","title":"edspdf.visualization.annotations","text":""},{"location":"reference/edspdf/visualization/annotations/#edspdf.visualization.annotations.show_annotations","title":"show_annotations","text":"

                                                                                                                                                                      Show Box annotations on a PDF document.

                                                                                                                                                                      PARAMETER DESCRIPTION pdf

                                                                                                                                                                      Bytes content of the PDF document

                                                                                                                                                                      TYPE: bytes

                                                                                                                                                                      annotations

                                                                                                                                                                      List of Box annotations to show

                                                                                                                                                                      TYPE: Sequence[Box]

                                                                                                                                                                      colors

                                                                                                                                                                      Colors to use for each label. If a list is provided, it will be used to color the first len(colors) unique labels. If a dictionary is provided, it will be used to color the labels in the dictionary. If None, a default color scheme will be used.

                                                                                                                                                                      TYPE: Optional[Union[Dict[str, str], List[str]]] DEFAULT: None

                                                                                                                                                                      RETURNS DESCRIPTION List[PpmImageFile]

                                                                                                                                                                      List of PIL images with the annotations. You can display them in a notebook with display(*pages).

                                                                                                                                                                      "},{"location":"reference/edspdf/visualization/annotations/#edspdf.visualization.annotations.compare_results","title":"compare_results","text":"

                                                                                                                                                                      Compare two sets of annotations on a PDF document.

                                                                                                                                                                      PARAMETER DESCRIPTION pdf

                                                                                                                                                                      Bytes content of the PDF document

                                                                                                                                                                      TYPE: bytes

                                                                                                                                                                      pred

                                                                                                                                                                      List of Box annotations to show on the left side

                                                                                                                                                                      TYPE: Sequence[Box]

                                                                                                                                                                      gold

                                                                                                                                                                      List of Box annotations to show on the right side

                                                                                                                                                                      TYPE: Sequence[Box]

                                                                                                                                                                      colors

                                                                                                                                                                      Colors to use for each label. If a list is provided, it will be used to color the first len(colors) unique labels. If a dictionary is provided, it will be used to color the labels in the dictionary. If None, a default color scheme will be used.

                                                                                                                                                                      TYPE: Optional[Union[Dict[str, str], List[str]]] DEFAULT: None

                                                                                                                                                                      RETURNS DESCRIPTION List[PpmImageFile]

                                                                                                                                                                      List of PIL images with the annotations. You can display them in a notebook with display(*pages).

                                                                                                                                                                      "},{"location":"reference/edspdf/visualization/merge/","title":"edspdf.visualization.merge","text":""},{"location":"reference/edspdf/visualization/merge/#edspdf.visualization.merge.merge_boxes","title":"merge_boxes","text":"

                                                                                                                                                                      Recursively merge boxes that have the same label to form larger non-overlapping boxes.

                                                                                                                                                                      PARAMETER DESCRIPTION boxes

                                                                                                                                                                      List of boxes to merge

                                                                                                                                                                      TYPE: Sequence[Box]

                                                                                                                                                                      RETURNS DESCRIPTION List[Box]

                                                                                                                                                                      List of merged boxes

                                                                                                                                                                      "},{"location":"utilities/","title":"Overview","text":"

                                                                                                                                                                      EDS-PDF provides a few utilities to help annotate PDF documents and debug the output of an extraction pipeline.

                                                                                                                                                                      "},{"location":"utilities/alignment/","title":"Alignment","text":"

                                                                                                                                                                      To simplify the annotation process, EDS-PDF provides a utility that aligns bounding boxes with text blocs extracted from a PDF document. This is particularly useful for annotating documents.

                                                                                                                                                                      Blocs | Blocs + Annotation | Aligned | Merged Blocs

                                                                                                                                                                      "},{"location":"utilities/visualisation/","title":"Visualisation","text":"

                                                                                                                                                                      EDS-PDF provides utilities to help you visualise the output of the pipeline.

                                                                                                                                                                      "},{"location":"utilities/visualisation/#visualising-a-pipelines-output","title":"Visualising a pipeline's output","text":"

                                                                                                                                                                      You can use EDS-PDF to overlay labelled bounding boxes on top of a PDF document.

                                                                                                                                                                      import edspdf\nfrom confit import Config\nfrom pathlib import Path\nfrom edspdf.visualization import show_annotations\n\nconfig = \"\"\"\n[pipeline]\npipeline = [\"extractor\", \"classifier\"]\n\n[components]\n\n[components.extractor]\n@factory = \"pdfminer-extractor\"\nextract_style = true\n\n[components.classifier]\n@factory = \"mask-classifier\"\nx0 = 0.25\nx1 = 0.95\ny0 = 0.3\ny1 = 0.9\nthreshold = 0.1\n\"\"\"\n\nmodel = edspdf.load(Config.from_str(config))\n\n# Get a PDF\npdf = Path(\"/Users/perceval/Development/edspdf/tests/resources/letter.pdf\").read_bytes()\n\n# Construct the DataFrame of blocs\ndoc = model(pdf)\n\n# Compute an image representation of each page of the PDF\n# overlaid with the predicted bounding boxes\nimgs = show_annotations(pdf=pdf, annotations=doc.text_boxes)\n\nimgs[0]\n

                                                                                                                                                                      If you run this code in a Jupyter notebook, you'll see the following:

                                                                                                                                                                      "},{"location":"utilities/visualisation/#merging-blocs-together","title":"Merging blocs together","text":"

                                                                                                                                                                      To help debug a pipeline (or a labelled dataset), you might want to merge blocs together according to their labels. EDS-PDF provides a merge_lines method that does just that.

                                                                                                                                                                      # \u2191 Omitted code above \u2191\nfrom edspdf.visualization import merge_boxes, show_annotations\n\nmerged = merge_boxes(doc.text_boxes)\n\nimgs = show_annotations(pdf=pdf, annotations=merged)\nimgs[0]\n

                                                                                                                                                                      See the difference:

                                                                                                                                                                      Original | Merged

                                                                                                                                                                      The merge_boxes method uses the notion of maximal cliques to compute merges. It forbids the combined blocs from overlapping with any bloc from another label.

                                                                                                                                                                      "}]} \ No newline at end of file diff --git a/main/sitemap.xml b/main/sitemap.xml new file mode 100644 index 00000000..0f8724ef --- /dev/null +++ b/main/sitemap.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/main/sitemap.xml.gz b/main/sitemap.xml.gz new file mode 100644 index 00000000..ecd60bdc Binary files /dev/null and b/main/sitemap.xml.gz differ diff --git a/main/trainable-pipes/index.html b/main/trainable-pipes/index.html new file mode 100644 index 00000000..1530b6d1 --- /dev/null +++ b/main/trainable-pipes/index.html @@ -0,0 +1,2951 @@ + + + + + + + + + + + + + + + + + + + + + + Trainable pipes - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                                      + +
                                                                                                                                                                      + + + + + + + + +
                                                                                                                                                                      + + +
                                                                                                                                                                      + +
                                                                                                                                                                      + + + + + + +
                                                                                                                                                                      +
                                                                                                                                                                      + + + +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      + + + + +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      + + + +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      + + + +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      + + + +
                                                                                                                                                                      +
                                                                                                                                                                      + + + + + + + +

                                                                                                                                                                      Trainable pipes

                                                                                                                                                                      +

                                                                                                                                                                      Trainable pipes allow for deep learning operations to be performed on the PDFDoc object and must be trained to be used. +Such pipes can be used to train a model to predict the label of the lines extracted from a PDF document.

                                                                                                                                                                      +

                                                                                                                                                                      Anatomy of a trainable pipe

                                                                                                                                                                      +

                                                                                                                                                                      Building and running deep learning models usually requires preprocessing the input sample into features, batching or "collating" these features together to process multiple samples at once, running deep learning operations over these features (in Pytorch, this step is done in the forward method) and postprocessing the outputs of these operations to complete the original sample.

                                                                                                                                                                      +

                                                                                                                                                                      In the trainable pipes of EDS-PDF, preprocessing and postprocessing are decoupled from the deep learning code but collocated with the forward method. This is achieved by splitting the class of a trainable component into four methods, which allows us to keep the development of new deep-learning components simple while ensuring efficient models both during training and inference.

                                                                                                                                                                      +

                                                                                                                                                                      preprocess

                                                                                                                                                                      + + +
                                                                                                                                                                      + + + +
                                                                                                                                                                      + +

                                                                                                                                                                      Preprocess the document to extract features that will be used by the +neural network to perform its predictions.

                                                                                                                                                                      + + + + + + + + + + + + + + +
                                                                                                                                                                      PARAMETERDESCRIPTION
                                                                                                                                                                      doc +

                                                                                                                                                                      PDF document to preprocess

                                                                                                                                                                      +

                                                                                                                                                                      + + TYPE: + PDFDoc + +

                                                                                                                                                                      +
                                                                                                                                                                      + + + + + + + + + + + + + + + + +
                                                                                                                                                                      RETURNSDESCRIPTION
                                                                                                                                                                      + + Dict[str, Any] + + +
                                                                                                                                                                      +

                                                                                                                                                                      Dictionary (optionally nested) containing the features extracted from +the document.

                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      + +
                                                                                                                                                                      + +

                                                                                                                                                                      collate

                                                                                                                                                                      + + +
                                                                                                                                                                      + + + +
                                                                                                                                                                      + +

                                                                                                                                                                      Collate the batch of features into a single batch of tensors that can be +used by the forward method of the component.

                                                                                                                                                                      + + + + + + + + + + + + + + + + + + +
                                                                                                                                                                      PARAMETERDESCRIPTION
                                                                                                                                                                      batch +

                                                                                                                                                                      Batch of features

                                                                                                                                                                      +

                                                                                                                                                                      + + TYPE: + NestedSequences + +

                                                                                                                                                                      +
                                                                                                                                                                      device +

                                                                                                                                                                      Device on which the tensors should be moved

                                                                                                                                                                      +

                                                                                                                                                                      + + TYPE: + device + +

                                                                                                                                                                      +
                                                                                                                                                                      + + + + + + + + + + + + + + + + +
                                                                                                                                                                      RETURNSDESCRIPTION
                                                                                                                                                                      + + InputBatch + + +
                                                                                                                                                                      +

                                                                                                                                                                      Dictionary (optionally nested) containing the collated tensors

                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      + +
                                                                                                                                                                      + +

                                                                                                                                                                      forward

                                                                                                                                                                      + + +
                                                                                                                                                                      + + + +
                                                                                                                                                                      + +

                                                                                                                                                                      Perform the forward pass of the neural network, i.e, apply transformations +over the collated features to compute new embeddings, probabilities, losses, etc

                                                                                                                                                                      + + + + + + + + + + + + + + +
                                                                                                                                                                      PARAMETERDESCRIPTION
                                                                                                                                                                      batch +

                                                                                                                                                                      Batch of tensors (nested dictionary) computed by the collate method

                                                                                                                                                                      +

                                                                                                                                                                      + + TYPE: + InputBatch + +

                                                                                                                                                                      +
                                                                                                                                                                      + + + + + + + + + + + + + + + + +
                                                                                                                                                                      RETURNSDESCRIPTION
                                                                                                                                                                      + + OutputBatch + + +
                                                                                                                                                                      + +
                                                                                                                                                                      +
                                                                                                                                                                      + +
                                                                                                                                                                      + +

                                                                                                                                                                      postprocess

                                                                                                                                                                      + + +
                                                                                                                                                                      + + + +
                                                                                                                                                                      + +

                                                                                                                                                                      Update the documents with the predictions of the neural network, for instance +converting label probabilities into label attributes on the document lines.

                                                                                                                                                                      +

                                                                                                                                                                      By default, this is a no-op.

                                                                                                                                                                      + + + + + + + + + + + + + + + + + + +
                                                                                                                                                                      PARAMETERDESCRIPTION
                                                                                                                                                                      docs +

                                                                                                                                                                      Batch of documents

                                                                                                                                                                      +

                                                                                                                                                                      + + TYPE: + Sequence[PDFDoc] + +

                                                                                                                                                                      +
                                                                                                                                                                      batch +

                                                                                                                                                                      Batch of predictions, as returned by the forward method

                                                                                                                                                                      +

                                                                                                                                                                      + + TYPE: + OutputBatch + +

                                                                                                                                                                      +
                                                                                                                                                                      + + + + + + + + + + + + + + + + +
                                                                                                                                                                      RETURNSDESCRIPTION
                                                                                                                                                                      + + Sequence[PDFDoc] + + +
                                                                                                                                                                      + +
                                                                                                                                                                      +
                                                                                                                                                                      + +
                                                                                                                                                                      + +

                                                                                                                                                                      Additionally, there is a fifth method:

                                                                                                                                                                      +

                                                                                                                                                                      post_init

                                                                                                                                                                      + + +
                                                                                                                                                                      + + + +
                                                                                                                                                                      + +

                                                                                                                                                                      This method completes the attributes of the component, by looking at some +documents. It is especially useful to build vocabularies or detect the labels +of a classification task.

                                                                                                                                                                      + + + + + + + + + + + + + + + + + + +
                                                                                                                                                                      PARAMETERDESCRIPTION
                                                                                                                                                                      gold_data +

                                                                                                                                                                      The documents to use for initialization.

                                                                                                                                                                      +

                                                                                                                                                                      + + TYPE: + Iterable[PDFDoc] + +

                                                                                                                                                                      +
                                                                                                                                                                      exclude +

                                                                                                                                                                      The names of components to exclude from initialization. +This argument will be gradually updated with the names of initialized +components

                                                                                                                                                                      +

                                                                                                                                                                      + + TYPE: + set + +

                                                                                                                                                                      +
                                                                                                                                                                      + +
                                                                                                                                                                      + +

                                                                                                                                                                      Implementing a trainable component

                                                                                                                                                                      +

                                                                                                                                                                      Here is an example of a trainable component:

                                                                                                                                                                      +
                                                                                                                                                                      from typing import Any, Dict, Iterable, Sequence
                                                                                                                                                                      +
                                                                                                                                                                      +import torch
                                                                                                                                                                      +from tqdm import tqdm
                                                                                                                                                                      +
                                                                                                                                                                      +from edspdf import Pipeline, TrainablePipe, registry
                                                                                                                                                                      +from edspdf.structures import PDFDoc
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      +@registry.factory.register("my-component")
                                                                                                                                                                      +class MyComponent(TrainablePipe):
                                                                                                                                                                      +    def __init__(
                                                                                                                                                                      +        self,
                                                                                                                                                                      +        # A subcomponent
                                                                                                                                                                      +        pipeline: Pipeline,
                                                                                                                                                                      +        name: str,
                                                                                                                                                                      +        embedding: TrainablePipe,
                                                                                                                                                                      +    ):
                                                                                                                                                                      +        super().__init__(pipeline=pipeline, name=name)
                                                                                                                                                                      +        self.embedding = embedding
                                                                                                                                                                      +
                                                                                                                                                                      +    def post_init(self, gold_data: Iterable[PDFDoc], exclude: set):
                                                                                                                                                                      +        # Initialize the component with the gold documents
                                                                                                                                                                      +        with self.label_vocabulary.initialization():
                                                                                                                                                                      +            for doc in tqdm(gold_data, desc="Initializing the component"):
                                                                                                                                                                      +                # Do something like learning a vocabulary over the initialization
                                                                                                                                                                      +                # documents
                                                                                                                                                                      +                ...
                                                                                                                                                                      +
                                                                                                                                                                      +        # And post_init the subcomponent
                                                                                                                                                                      +        exclude.add(self.name)
                                                                                                                                                                      +        self.embedding.post_init(gold_data, exclude)
                                                                                                                                                                      +
                                                                                                                                                                      +        # Initialize any layer that might be missing from the module
                                                                                                                                                                      +        self.classifier = torch.nn.Linear(...)
                                                                                                                                                                      +
                                                                                                                                                                      +    def preprocess(self, doc: PDFDoc, supervision: bool = False) -> Dict[str, Any]:
                                                                                                                                                                      +        # Preprocess the doc to extract features required to run the embedding
                                                                                                                                                                      +        # subcomponent, and this component
                                                                                                                                                                      +        return {
                                                                                                                                                                      +            "embedding": self.embedding.preprocess_supervised(doc),
                                                                                                                                                                      +            "my-feature": ...(doc),
                                                                                                                                                                      +        }
                                                                                                                                                                      +
                                                                                                                                                                      +    def collate(self, batch, device: torch.device) -> Dict:
                                                                                                                                                                      +        # Collate the features of the "embedding" subcomponent
                                                                                                                                                                      +        # and the features of this component as well
                                                                                                                                                                      +        return {
                                                                                                                                                                      +            "embedding": self.embedding.collate(batch["embedding"], device),
                                                                                                                                                                      +            "my-feature": torch.as_tensor(batch["my-feature"], device=device),
                                                                                                                                                                      +        }
                                                                                                                                                                      +
                                                                                                                                                                      +    def forward(self, batch: Dict, supervision=False) -> Dict:
                                                                                                                                                                      +        # Call the embedding subcomponent
                                                                                                                                                                      +        embeds = self.embedding(batch["embedding"])
                                                                                                                                                                      +
                                                                                                                                                                      +        # Do something with the embedding tensors
                                                                                                                                                                      +        output = ...(embeds)
                                                                                                                                                                      +
                                                                                                                                                                      +        return output
                                                                                                                                                                      +
                                                                                                                                                                      +    def postprocess(self, docs: Sequence[PDFDoc], output: Dict) -> Sequence[PDFDoc]:
                                                                                                                                                                      +        # Annotate the docs with the outputs of the forward method
                                                                                                                                                                      +        ...
                                                                                                                                                                      +        return docs
                                                                                                                                                                      +
                                                                                                                                                                      +

                                                                                                                                                                      Nesting trainable pipes

                                                                                                                                                                      +

                                                                                                                                                                      Like pytorch modules, trainable pipes can be composed together to build complex architectures. For instance, a trainable classifier component may delegate some of its logic to an embedding component, which will only be responsible for converting PDF lines into multidimensional arrays of numbers.

                                                                                                                                                                      +

                                                                                                                                                                      Nesting pipes allows switching parts of the neural networks to test various architectures and keeping the modelling logic modular.

                                                                                                                                                                      +

                                                                                                                                                                      Sharing subcomponents

                                                                                                                                                                      +

                                                                                                                                                                      Sharing parts of a neural network while training on different tasks can be an effective way to improve the network efficiency. For instance, it is common to share an embedding layer between multiple tasks that require embedding the same inputs.

                                                                                                                                                                      +

                                                                                                                                                                      In EDS-PDF, sharing a subcomponent is simply done by sharing the object between the multiple pipes. You can either refer to an existing subcomponent when configuring a new component in Python, or use the interpolation mechanism of our configuration system.

                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      pipeline.add_pipe(
                                                                                                                                                                      +    "my-component-1",
                                                                                                                                                                      +    name="first",
                                                                                                                                                                      +    config={
                                                                                                                                                                      +        "embedding": {
                                                                                                                                                                      +            "@factory": "box-embedding",
                                                                                                                                                                      +            # ...
                                                                                                                                                                      +        }
                                                                                                                                                                      +    },
                                                                                                                                                                      +)
                                                                                                                                                                      +pipeline.add_pipe(
                                                                                                                                                                      +    "my-component-2",
                                                                                                                                                                      +    name="second",
                                                                                                                                                                      +    config={
                                                                                                                                                                      +        "embedding": pipeline.components.first.embedding,
                                                                                                                                                                      +    },
                                                                                                                                                                      +)
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      [components.first]
                                                                                                                                                                      +@factory = "my-component-1"
                                                                                                                                                                      +
                                                                                                                                                                      +[components.first.embedding]
                                                                                                                                                                      +@factory = "box-embedding"
                                                                                                                                                                      +...
                                                                                                                                                                      +
                                                                                                                                                                      +[components.second]
                                                                                                                                                                      +@factory = "my-component-2"
                                                                                                                                                                      +embedding = ${components.first.embedding}
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      +
                                                                                                                                                                      +

                                                                                                                                                                      To avoid recomputing the preprocess / forward and collate in the multiple components that use it, we rely on a light cache system.

                                                                                                                                                                      +

                                                                                                                                                                      During the training loop, when computing the loss for each component, the forward calls must be wrapped by the pipeline.cache() context to enable this caching mechanism between components.

                                                                                                                                                                      +

                                                                                                                                                                        + + + + + + +
                                                                                                                                                                        +
                                                                                                                                                                        + + +
                                                                                                                                                                        + +
                                                                                                                                                                        + + + +
                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/utilities/alignment/index.html b/main/utilities/alignment/index.html new file mode 100644 index 00000000..af3a4c45 --- /dev/null +++ b/main/utilities/alignment/index.html @@ -0,0 +1,2357 @@ + + + + + + + + + + + + + + + + + + + + + + Alignment - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                                        + +
                                                                                                                                                                        + + + + + + + + +
                                                                                                                                                                        + + +
                                                                                                                                                                        + +
                                                                                                                                                                        + + + + + + +
                                                                                                                                                                        +
                                                                                                                                                                        + + + +
                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        + + + + +
                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        + + + +
                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        + + + +
                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        + + + +
                                                                                                                                                                        +
                                                                                                                                                                        + + + + + + + +

                                                                                                                                                                        Alignment

                                                                                                                                                                        +

                                                                                                                                                                        To simplify the annotation process, EDS-PDF provides a utility that aligns +bounding boxes with text blocs extracted from a PDF document. +This is particularly useful for annotating documents.

                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        +

                                                                                                                                                                        blocs

                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        +

                                                                                                                                                                        blocs + annotation

                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        +

                                                                                                                                                                        aligned

                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        +

                                                                                                                                                                        resources

                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        +
                                                                                                                                                                        +

                                                                                                                                                                          + + + + + + +
                                                                                                                                                                          +
                                                                                                                                                                          + + +
                                                                                                                                                                          + +
                                                                                                                                                                          + + + +
                                                                                                                                                                          +
                                                                                                                                                                          +
                                                                                                                                                                          +
                                                                                                                                                                          + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/utilities/index.html b/main/utilities/index.html new file mode 100644 index 00000000..3731fd37 --- /dev/null +++ b/main/utilities/index.html @@ -0,0 +1,2329 @@ + + + + + + + + + + + + + + + + + + + + + + Overview - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                                          + +
                                                                                                                                                                          + + + + + + + + +
                                                                                                                                                                          + + +
                                                                                                                                                                          + +
                                                                                                                                                                          + + + + + + +
                                                                                                                                                                          +
                                                                                                                                                                          + + + +
                                                                                                                                                                          +
                                                                                                                                                                          +
                                                                                                                                                                          + + + + +
                                                                                                                                                                          +
                                                                                                                                                                          +
                                                                                                                                                                          + + + +
                                                                                                                                                                          +
                                                                                                                                                                          +
                                                                                                                                                                          + + + +
                                                                                                                                                                          +
                                                                                                                                                                          +
                                                                                                                                                                          + + + +
                                                                                                                                                                          +
                                                                                                                                                                          + + + + + + + +

                                                                                                                                                                          Overview

                                                                                                                                                                          +

                                                                                                                                                                          EDS-PDF provides a few utilities to help annotate PDF documents and debug the output of an extraction pipeline.

                                                                                                                                                                          +

                                                                                                                                                                            + + + + + + +
                                                                                                                                                                            +
                                                                                                                                                                            + + +
                                                                                                                                                                            + +
                                                                                                                                                                            + + + +
                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/main/utilities/resources/aligned-merged.jpeg b/main/utilities/resources/aligned-merged.jpeg new file mode 100644 index 00000000..ee94c0e8 Binary files /dev/null and b/main/utilities/resources/aligned-merged.jpeg differ diff --git a/main/utilities/resources/aligned.jpeg b/main/utilities/resources/aligned.jpeg new file mode 100644 index 00000000..3c373bb9 Binary files /dev/null and b/main/utilities/resources/aligned.jpeg differ diff --git a/main/utilities/resources/blocs.jpeg b/main/utilities/resources/blocs.jpeg new file mode 100644 index 00000000..f2ba6c25 Binary files /dev/null and b/main/utilities/resources/blocs.jpeg differ diff --git a/main/utilities/resources/blocs.png b/main/utilities/resources/blocs.png new file mode 100644 index 00000000..9e735fff Binary files /dev/null and b/main/utilities/resources/blocs.png differ diff --git a/main/utilities/resources/lines.jpeg b/main/utilities/resources/lines.jpeg new file mode 100644 index 00000000..b3afb26b Binary files /dev/null and b/main/utilities/resources/lines.jpeg differ diff --git a/main/utilities/resources/merged.jpeg b/main/utilities/resources/merged.jpeg new file mode 100644 index 00000000..c6d767d1 Binary files /dev/null and b/main/utilities/resources/merged.jpeg differ diff --git a/main/utilities/visualisation/index.html b/main/utilities/visualisation/index.html new file mode 100644 index 00000000..6f722645 --- /dev/null +++ b/main/utilities/visualisation/index.html @@ -0,0 +1,2466 @@ + + + + + + + + + + + + + + + + + + + + + + Visualisation - EDS-PDF + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                                                                                                                                                            + +
                                                                                                                                                                            + + + + + + + + +
                                                                                                                                                                            + + +
                                                                                                                                                                            + +
                                                                                                                                                                            + + + + + + +
                                                                                                                                                                            +
                                                                                                                                                                            + + + +
                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            + + + + +
                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            + + + +
                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            + + + +
                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            + + + +
                                                                                                                                                                            +
                                                                                                                                                                            + + + + + + + +

                                                                                                                                                                            Visualisation

                                                                                                                                                                            +

                                                                                                                                                                            EDS-PDF provides utilities to help you visualise the output of the pipeline.

                                                                                                                                                                            +

                                                                                                                                                                            Visualising a pipeline's output

                                                                                                                                                                            +

                                                                                                                                                                            You can use EDS-PDF to overlay labelled bounding boxes on top of a PDF document.

                                                                                                                                                                            +
                                                                                                                                                                            import edspdf
                                                                                                                                                                            +from confit import Config
                                                                                                                                                                            +from pathlib import Path
                                                                                                                                                                            +from edspdf.visualization import show_annotations
                                                                                                                                                                            +
                                                                                                                                                                            +config = """
                                                                                                                                                                            +[pipeline]
                                                                                                                                                                            +pipeline = ["extractor", "classifier"]
                                                                                                                                                                            +
                                                                                                                                                                            +[components]
                                                                                                                                                                            +
                                                                                                                                                                            +[components.extractor]
                                                                                                                                                                            +@factory = "pdfminer-extractor"
                                                                                                                                                                            +extract_style = true
                                                                                                                                                                            +
                                                                                                                                                                            +[components.classifier]
                                                                                                                                                                            +@factory = "mask-classifier"
                                                                                                                                                                            +x0 = 0.25
                                                                                                                                                                            +x1 = 0.95
                                                                                                                                                                            +y0 = 0.3
                                                                                                                                                                            +y1 = 0.9
                                                                                                                                                                            +threshold = 0.1
                                                                                                                                                                            +"""
                                                                                                                                                                            +
                                                                                                                                                                            +model = edspdf.load(Config.from_str(config))
                                                                                                                                                                            +
                                                                                                                                                                            +# Get a PDF
                                                                                                                                                                            +pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes()
                                                                                                                                                                            +
                                                                                                                                                                            +# Run the pipeline on the PDF to get the processed document
                                                                                                                                                                            +doc = model(pdf)
                                                                                                                                                                            +
                                                                                                                                                                            +# Compute an image representation of each page of the PDF
                                                                                                                                                                            +# overlaid with the predicted bounding boxes
                                                                                                                                                                            +imgs = show_annotations(pdf=pdf, annotations=doc.text_boxes)
                                                                                                                                                                            +
                                                                                                                                                                            +imgs[0]
                                                                                                                                                                            +
                                                                                                                                                                            +

                                                                                                                                                                            If you run this code in a Jupyter notebook, you'll see the following:

                                                                                                                                                                            +

                                                                                                                                                                            lines

                                                                                                                                                                            +

                                                                                                                                                                            Merging blocs together

                                                                                                                                                                            +

                                                                                                                                                                            To help debug a pipeline (or a labelled dataset), you might want to +merge blocs together according to their labels. EDS-PDF provides a merge_lines method +that does just that.

                                                                                                                                                                            +
                                                                                                                                                                            # ↑ Omitted code above ↑
                                                                                                                                                                            +from edspdf.visualization import merge_boxes, show_annotations
                                                                                                                                                                            +
                                                                                                                                                                            +merged = merge_boxes(doc.text_boxes)
                                                                                                                                                                            +
                                                                                                                                                                            +imgs = show_annotations(pdf=pdf, annotations=merged)
                                                                                                                                                                            +imgs[0]
                                                                                                                                                                            +
                                                                                                                                                                            +

                                                                                                                                                                            See the difference:

                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            +

                                                                                                                                                                            lines

                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            +

                                                                                                                                                                            lines

                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            +
                                                                                                                                                                            +

                                                                                                                                                                            The merge_boxes method uses the notion of maximal cliques to compute merges. +It forbids the combined blocs from overlapping with any bloc from another label.

                                                                                                                                                                            +

                                                                                                                                                                              + + + + + + +
                                                                                                                                                                              +
                                                                                                                                                                              + + +
                                                                                                                                                                              + +
                                                                                                                                                                              + + + +
                                                                                                                                                                              +
                                                                                                                                                                              +
                                                                                                                                                                              +
                                                                                                                                                                              + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/versions.json b/versions.json index 0237fc2f..5581e36f 100644 --- a/versions.json +++ b/versions.json @@ -1 +1 @@ -[{"version": "v0.8.0", "title": "v0.8.0", "aliases": ["latest"]}, {"version": "v0.7.0", "title": "v0.7.0", "aliases": []}, {"version": "v0.5.3", "title": "v0.5.3", "aliases": []}, {"version": "v0.5.2", "title": "v0.5.2", "aliases": []}, {"version": "v0.5.1", "title": "v0.5.1", "aliases": []}, {"version": "v0.5.0", "title": "v0.5.0", "aliases": []}, {"version": "dev", "title": "dev", "aliases": []}] \ No newline at end of file +[{"version": "v0.8.0", "title": "v0.8.0", "aliases": ["latest"]}, {"version": "v0.7.0", "title": "v0.7.0", "aliases": []}, {"version": "v0.5.3", "title": "v0.5.3", "aliases": []}, {"version": "v0.5.2", "title": "v0.5.2", "aliases": []}, {"version": "v0.5.1", "title": "v0.5.1", "aliases": []}, {"version": "v0.5.0", "title": "v0.5.0", "aliases": []}, {"version": "main", "title": "main", "aliases": []}, {"version": "dev", "title": "dev", "aliases": []}] \ No newline at end of file