Query and Download PDFs from Open Raadsinformatie

forums · 16 oktober 2020 om 14:21

This article explains how to query and download PDF documents from the site openraadsinformatie.nl provided by Open State Foundation and VNG using Invantive SQL.

Instructions

To retrieve all PDF documents into a folder of Open Raadsinformatie using the following Invantive SQL statements:

create or replace table urls@inmemorystorage
as
select '' filename
,      '' url
from   dual@datadictionary
where  false

create or replace table output@inmemorystorage
as
select '' filename
,      unicode_to_blob(null) contents
from   dual@datadictionary
where  false

declare
  --
  -- History seems to start at August 1, 2017.
  --
  p_date_start date := to_date('20180101', 'YYYYMMDD');
  p_date_end   date := to_date('20180401', 'YYYYMMDD');
  --
  l_loop   boolean;
  l_date   date;
  l_date_c varchar2;
begin
  l_date := p_date_start;
  while l_date < p_date_end
  loop
    l_date_c := to_char(l_date, 'YYYY-MM-DD');
    insert into urls@InMemoryStorage
    ( filename
    , url
    )
    select case
           when dta.contentType = 'application/pdf'
           then normalize(to_char(dta.id) || '-' || dta.name || '.pdf', 240, false)
           else dta.name
           end
           filename
    ,      dta.originalUrl
    from   httpdownload@datadictionary
           ( url => 'https://api.openraadsinformatie.nl/v1/elastic/ori_*/_search?size=1000&q=date_modified:[' || l_date_c || ' TO ' || l_date_c || ']&pretty=false'
           , method => 'GET'
           ) htp
    join   jsontable
           ( 'hits.hits.[*]'
             passing htp.content_clob
             columns id                  number   path '_id'
             ,       type                varchar2 path '_type'
             ,       name                varchar2 path '_source.name'
             ,       dateModified        varchar2 path '_source.date_modified'
             ,       originalUrl         varchar2 path '_source.original_url'
             ,       contentType         varchar2 path '_source.content_type'
           ) dta
    on     dta.contenttype = 'application/pdf'
    and    dta.originalUrl like 'http%'
    ;
    l_date := l_date + 1;
  end loop;
end;

begin
  for r in ( select * from URLS@InMemoryStorage )
  loop
    --
    -- For now ignore all errors, since httpdownload forwards HTTP errors such as 404.
    --
    begin
      insert into OUTPUT@InMemoryStorage
      ( filename
      , contents
      )
      select r.filename
      ,      fle.content_blob
      from   httpdownload@datadictionary
             ( url => r.url
             , method => 'GET'
             ) fle
      ;
    exception
      when others 
      then 
        null;
    end;
  end loop;
end;

select * 
from   output@inmemorystorage

local export documents in contents to "c:\temp\ori" filename column filename

Topic		Antwoorden	Weergaven
Hoe download ik automatisch alle documenten van een website? Wiki (nl) invantive-sql , httpdownload	0	3316	13 december 2020
Toevoegen PDF-bestanden aan Knack.com tabellen Wiki (nl) knack , rivm	0	701	13 november 2020
Bulk Download Documents from Exact Online Companies Wiki (en) exact-online , get-my-report , invantive-sql , invantive-psql	0	1254	22 december 2020
Download in 1x alle documenten uit Exact Online administratie(s) Wiki (nl) exact-online , invantive-query-tool , get-my-report	4	5006	10 juni 2024
Download PDF van facturen uit Teamleader Wiki (nl) invantive-query-tool , teamleader-focus	0	906	26 april 2021

Query and Download PDFs from Open Raadsinformatie

Instructions

Gerelateerde topics