Compare commits


3 Commits

Author SHA1 Message Date
Cyberes 6acbdf3117 break tiffs into chunks 2023-11-06 18:42:52 -07:00
Cyberes 65953c9bde add tiff threading 2023-11-06 18:28:44 -07:00
Cyberes 664eb1a52f optimize building geotiff 2023-11-06 17:48:09 -07:00
7 changed files with 158 additions and 248 deletions


@@ -2,54 +2,46 @@
_Scrape tiles from WMTS servers._
You know what I hate? Those godforsaken WMTS servers, perched on their digital thrones, acting like they're the TILE TYRANTS of the universe. They think they can just LOCK UP their precious little tiles and keep me from doing my THING? HA!
No more will these WMTS servers shroud their CRAPPY-ASS tiles in mystery. I'm coming for your DATA, you binary BASTARDS, and there's not a SINGLE 1 or 0 you can throw at me that will stop my CHARGE.
You think your firewalls and security mumbo-jumbo can keep me at bay? THINK AGAIN. I'll slice through your defenses like a HOT PIZZA through COLD BUTTER. I'll have your DATA, and there's absolutely NOTHING, I repeat, NOTHING you can do to STOP ME.
So, buckle up, WMTS servers. Your reign of TILE TERROR is about to CRASH AND BURN. I'm coming for your DATA, and I'm bringing a whole lot of CHAOS with me.
## Install
It's recommended to use a venv.
### Install
```shell
pip install -r requirements.txt
```
### Use
Example:
## Use
```shell
python3 exfiltrate.py \
https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/ \
--zoom 20 \
--referer https://maps.nlsc.gov.tw/ \
--bbox 25.076387 121.68951 25.068282 121.700175 \
--threads 30
```
Some WMTS servers require the correct `Referer` header to be set, otherwise they will reject your request or return blank data. Use `--referer` to set this header.
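A quick way to check whether a particular server cares about this header is to request a single tile with and without it. The sketch below uses `requests`; the zoom/row/col values are placeholders you would swap for a real tile in your area.
```python
import requests

base_url = 'https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857'
zoom, row, col = 20, 0, 0  # placeholder tile coordinates
tile_url = f'{base_url}/{zoom}/{row}/{col}'

plain = requests.get(tile_url, timeout=60)
with_ref = requests.get(tile_url, headers={'Referer': 'https://maps.nlsc.gov.tw/'}, timeout=60)

# A server that checks the Referer will typically reject the first request or
# return blank data, while the second comes back as a normal image/png response.
print(plain.status_code, plain.headers.get('Content-Type'))
print(with_ref.status_code, with_ref.headers.get('Content-Type'))
```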
Building the GeoTIFF will take dozens of gigs of memory for any significant extent! For example, a 21 mile extent
required about 400GB of memory. You can use swap for this, but don't expect it to be very quick if you go this route.
Do `./exfiltrate.py -h` to get more info on what the different command args do.
### ArcGIS
Building the GeoTIFF will take dozens of gigs of memory for any significant extent! For example, a 336 square mile extent required about 280GB of memory. You can use swap, but you will need a very fast SSD; I had good results with a Samsung 980 PRO partitioned for swap.
TODO
Be careful not to go overboard with your spatial extent: download only what you need, or you can easily end up in a situation where it takes a week to download all the tiles and build the TIFF.
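If you want a rough idea of how many tiles a bounding box will pull at a given zoom level before you start downloading, the standard slippy-map tile math gives a quick estimate. This is only a sketch; the repo's own `pkg.spatial.deg2num` presumably does the same conversion, but that is an assumption.
```python
import math

def deg2num(lat: float, lon: float, zoom: int) -> tuple[int, int]:
    # Standard Web Mercator (slippy map) lat/lon -> tile (row, col) conversion.
    n = 2 ** zoom
    col = int((lon + 180.0) / 360.0 * n)
    row = int((1.0 - math.asinh(math.tan(math.radians(lat))) / math.pi) / 2.0 * n)
    return row, col

# Bounding box from the example above: top-left lat/lon, then bottom-right lat/lon.
min_row, min_col = deg2num(25.076387, 121.68951, 20)
max_row, max_col = deg2num(25.068282, 121.700175, 20)
print((max_row - min_row + 1) * (max_col - min_col + 1))  # approximate tile count
```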
### Credits
`test.sh` is provided for demonstration. It downloads 856 tiles (less than 2MB) from a WMTS server in Taiwan.
## Output
This program outputs a geo-referenced TIFF image with three bands corresponding to the red, green, and blue values of the original tiles' pixels. If one of the bands has a value of `0`, that value is adjusted to `1` so that it does not conflict with the `NODATA` value of `0`.
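For illustration, here is a minimal numpy sketch of that adjustment, mirroring the logic used when the TIFF is built (`tile` stands in for one tile's RGB array):
```python
import numpy as np

# Example RGB data: a white pixel, a pixel with zero bands, and a normal pixel.
tile = np.array([[[255, 255, 255], [255, 0, 0], [12, 34, 56]]], dtype=np.uint8)

# White pixels become the NODATA value (0, 0, 0).
tile[np.all(tile == [255, 255, 255], axis=-1)] = [0, 0, 0]

# For pixels that are not entirely zero, bump any zero band up to 1 so the
# pixel can never be mistaken for NODATA.
mask = np.any(tile == 0, axis=-1) & np.any(tile != 0, axis=-1)
for band in range(3):
    tile[mask & (tile[:, :, band] == 0), band] = 1

print(tile)  # pixel values are now (0, 0, 0), (255, 1, 1), (12, 34, 56)
```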
## ArcGIS
The generated TIFF raster should be fully compatible with ArcGIS, but if you encounter color issues, try adjusting your symbology.
## Inspiration
https://jimmyutterstrom.com/blog/2019/06/05/map-tiles-to-geotiff/

exfiltrate.py Executable file → Normal file

@@ -1,9 +1,9 @@
import argparse
import base64
import os
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from queue import Queue
import numpy as np
import rasterio
@@ -11,32 +11,25 @@ from PIL import Image
from rasterio import Affine
from tqdm import tqdm
from pkg.helpers import convert_seconds
from pkg.image import random_file_width
from pkg.spatial import deg2num
from pkg.thread import download_tile
if __name__ == '__main__':
main_start_time = time.time()
parser = argparse.ArgumentParser(description='Exfiltrate data from WMS servers.')
parser.add_argument('base_url', help='The base URL for the WMS server. Example: https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/')
parser.add_argument('--zoom', type=int, required=True, help='The zoom level to use.')
parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)')
parser.add_argument('--dl-threads', type=int, default=10, help='Number of download threads to use.')
parser.add_argument('--threads', type=int, default=10, help='Number of download threads to use.')
parser.add_argument('--referer', help='The content of the Referer header to send.')
parser.add_argument('--output', default='wmts-output', help='Output directory path.')
parser.add_argument('--proxy', action='store_true', help='Enable using a proxy. You must modify pkg/proxies.py')
parser.add_argument('--tiff-threads', default=None, type=int, help=f'Number of threads to use when building TIFF. Default: {min(32, (os.cpu_count() or 1) + 4)}') # https://github.com/python/cpython/blob/3.10/Lib/concurrent/futures/thread.py#L142C27-L142C61
parser.add_argument('--proxy', action='store_true', help='Enable using a proxy.')
parser.add_argument('--tiff-threads', default=10, type=int, help='Number of threads to use when building TIFF. Default: 10')
parser.add_argument('--output-tiff', help='Path for output GeoTIFF. Default: wmts-output/output.tiff')
parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)')
# parser.add_argument('--extent', default=None, help='Specify an extent to break the output image to. This is the diagonal.')
parser.add_argument('--no-download', action='store_true', help="Don't do any downloading or image checking.")
parser.add_argument('--download-loops', default=1, type=int, help='Sometimes the tiles are downloaded incorrectly. Re-running the download process can fix these corrupted tiles. This arg specifies how many times to run the download process. Default: 1')
parser.add_argument('--convert', action='store_true', help="Convert tiles to PNG. Sometimes the server will return transparent GIFs when there are blank tiles so this will convert them to a PNG.")
args = parser.parse_args()
if args.download_loops <= 0:
print('--download-loops must be greater than 0')
quit(1)
args.base_url = args.base_url.strip('/') + f'/{args.zoom}/'
base_output = Path(args.output).resolve().absolute().expanduser()
url_hash = base64.b64encode(args.base_url.encode()).decode('utf-8').strip('==')
@@ -60,158 +53,137 @@ if __name__ == '__main__':
if args.referer:
r_headers['Referer'] = args.referer
retry_files = set()
tiles = set()
total_new_files = 0
total_fixed_files = 0
tiles = []
retries = []
total_downloaded = 0
row_i = min_row
row_iter = range(min_row, max_row + 1)
row_bar = tqdm(total=len(row_iter), desc=f'Row {row_i}', postfix={'new_files': total_downloaded, 'failures': len(retries)})
for row in row_iter:
row_i = row
col_iter = range(min_col, max_col + 1)
col_bar = tqdm(total=len(col_iter), leave=False)
row_bar = tqdm(total=0, desc='Row 000 | Loop 0/0', postfix={'new_files': 0, 'failures': 0, 'fixed': 0})
for i in range(1, args.download_loops + 1):
row_bar.reset()
converted_files = 0
fixed_files = 0
new_files = 0
row_i = min_row
row_iter = range(min_row, max_row + 1)
row_bar.total = len(row_iter)
row_bar.desc = f'Row {row_i} | Loop {i}/{args.download_loops}'
def update_bar_postfix():
row_bar.set_postfix({'new': new_files, 'failures': len(retry_files), 'fixed': fixed_files, 'converted': converted_files})
update_bar_postfix()
for row in row_iter:
row_i = row
col_iter = range(min_col, max_col + 1)
if args.no_download:
for col in col_iter:
tiles.add((row, col))
else:
col_bar = tqdm(total=len(col_iter), leave=False)
# On the final download loop (if we are doing multiple), don't convert any files.
do_convert = False if args.convert and 1 < args.download_loops == i else args.convert
with (ThreadPoolExecutor(args.dl_threads) as executor):
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy, do_convert)) for col in col_iter]
for future in as_completed(futures):
result = future.result()
if result:
result_row, result_col, new_image = result
if new_image == 'success':
new_files += 1
total_new_files += 1
tiles.add((result_row, result_col))
elif new_image == 'exist':
tiles.add((result_row, result_col))
elif new_image == 'failure':
retry_files.add((result_row, result_col))
elif new_image == 'fixed':
tiles.add((result_row, result_col))
fixed_files += 1
total_fixed_files += 1
elif new_image == 'converted':
tiles.add((result_row, result_col))
converted_files += 1
col_bar.update()
row_bar.refresh()
col_bar.close()
update_bar_postfix()
row_bar.update()
if total_new_files == 0 and total_fixed_files == 0:
break
if args.no_download:
for col in col_iter:
tiles.append((row, col))
else:
with (ThreadPoolExecutor(args.threads) as executor):
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for col in col_iter]
for future in as_completed(futures):
result = future.result()
if result:
result_row, result_col, new_image = result
if new_image == 'success':
total_downloaded += 1
tiles.append((result_row, result_col))
elif new_image == 'exist':
tiles.append((result_row, result_col))
elif new_image == 'failure':
retries.append((result_row, result_col))
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
col_bar.update()
row_bar.refresh()
col_bar.close()
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
row_bar.update()
row_bar.close()
if total_new_files == 0 and total_fixed_files == 0:
print('All files downloaded, exiting download loop.')
col_bar = tqdm(total=len(retry_files), desc=f'Tile Retries')
with ThreadPoolExecutor(args.dl_threads) as executor:
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for row, col in retry_files]
col_bar = tqdm(total=len(retries), desc=f'Tile Retries')
with ThreadPoolExecutor(args.threads) as executor:
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for row, col in retries]
for future in as_completed(futures):
result = future.result()
if result:
result_row, result_col, new_image = result
tiles.add((result_row, result_col))
tiles.append((result_row, result_col))
if new_image == 'success':
new_files += 1
total_downloaded += 1
elif new_image == 'failure':
col_bar.write(f'{(result_row, result_col)} failed!')
col_bar.update()
col_bar.close()
print(f'Downloaded {new_files} images.')
print(f'Downloaded {total_downloaded} images.')
print('Preparing data...')
print('Determining tile width...', end='')
tile_size = random_file_width(tiles_output)
print(f' {tile_size}px')
# Define the number of rows and columns based on the bounding box
print('Calculating maximum columns and rows...', end='')
num_rows = max_row - min_row + 1
num_cols = max_col - min_col + 1
print(f' {num_cols}x{num_rows}')
# Create an empty array to store the image data
print(f'Allocating an array with shape {num_rows * tile_size, num_cols * tile_size} and dimension 3...')
image_data = np.empty((num_rows * tile_size, num_cols * tile_size, 3), dtype=np.uint8)
def build_tiff_data(task):
row, col = task
tile_file = tiles_output / f"{row}_{col}.png"
if not tile_file.is_file():
raise Exception(f'Tile does not exist: {tile_file}')
with Image.open(tile_file) as img:
tile_data = np.array(img)
# Remove the alpha channel
tile_data = tile_data[:, :, :3]
# Replace white pixels with NODATA
tile_data[np.all(tile_data == [255, 255, 255], axis=-1)] = [0, 0, 0]
# ArcGIS does not like pixels that have zeros in them, eg. (255, 0, 0). We need to convert the zeros to ones, eg. (255, 1, 1).
mask = np.any(tile_data == 0, axis=-1) & np.any(tile_data != 0, axis=-1) # Identify pixels where not all bands are zero and at least one band is zero.
for i in range(3): # Iterate over each band.
# For these pixels, set zero bands to one.
tile_data[mask & (tile_data[:, :, i] == 0), i] = 1
# Calculate the position of the tile in the image data array.
row_pos = (row - min_row) * tile_size
col_pos = (col - min_col) * tile_size
# Insert the tile data into the image data array at the correct spot.
image_data[row_pos:row_pos + tile_size, col_pos:col_pos + tile_size] = tile_data
with ThreadPoolExecutor(max_workers=args.tiff_threads) as executor:
futures = {executor.submit(build_tiff_data, task) for task in tiles}
bar = tqdm(total=len(futures), desc='Building TIFF', postfix='There may be a lengthy startup time, please be patient!')
for future in as_completed(futures):
bar.set_postfix()
bar.update()
bar.close()
# Transpose the image data array to the format (bands, rows, cols).
print('Transposing...')
image_data = np.transpose(image_data, (2, 0, 1))
# Define the transformation from pixel coordinates to geographic coordinates, which is an Affine transformation that
# maps pixel coordinates in the image to geographic coordinates on the Earth's surface.
print('Calculating transform...')
transform = (Affine.translation(top_left_lon, top_left_lat) # Create a translation transformation that shifts the image and set the origin of the image to the top-left corner of the bounding box.
# Create a scaling transformation that scales the image in the x and y directions to convert the pixel coordinates of the image to the geographic coordinates of the bounding box.
* Affine.scale((bottom_right_lon - top_left_lon) / image_data.shape[2], (bottom_right_lat - top_left_lat) / image_data.shape[1]))
transform = (Affine.translation(top_left_lon, top_left_lat)
* Affine.scale((bottom_right_lon - top_left_lon) / (num_cols * tile_size),
(bottom_right_lat - top_left_lat) / (num_rows * tile_size)))
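# The resulting transform maps pixel (col, row) to geographic coordinates:
# lon = top_left_lon + col * x_res and lat = top_left_lat + row * y_res, where
# y_res is negative (bottom_right_lat < top_left_lat) so latitude decreases as
# the row index increases.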
# Write the image data to a GeoTIFF file
print('Writing TIFF to:', output_tiff)
start_write_tiff = time.time()
with rasterio.open(output_tiff, "w", driver="GTiff", height=num_rows * tile_size, width=num_cols * tile_size, count=3, dtype=str(image_data.dtype), crs='EPSG:4326', transform=transform, compress="DEFLATE", nodata=0) as dst:
dst.write(image_data, indexes=[1, 2, 3])
now = time.time()
print(f'Time to write TIFF: {convert_seconds(int(now - start_write_tiff))} seconds. Total run time: {convert_seconds(int(now - main_start_time))}')
# Divide the tiles into n groups
tile_groups = np.array_split(tiles, args.tiff_threads)
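# Each worker writes its own part file (the output stem plus '_part{i}') and pulls
# tile tasks from the shared queue defined below, one tile at a time.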
def worker(pbar, output_tiff, tile_group):
with rasterio.open(output_tiff, "w", driver="GTiff", height=num_rows * tile_size, width=num_cols * tile_size, count=3, dtype='uint8', crs='EPSG:4326', transform=transform, compress="DEFLATE", nodata=0) as dst:
while True:
task = q.get()
if task is None:
break
row, col = task
tile_file = tiles_output / f"{row}_{col}.png"
if not tile_file.is_file():
raise Exception(f'Tile does not exist: {tile_file}')
with Image.open(tile_file) as img:
tile_data = np.array(img, dtype=np.uint8)
# Remove the alpha channel
tile_data = tile_data[:, :, :3]
# Replace white pixels with NODATA
tile_data[np.all(tile_data == [255, 255, 255], axis=-1)] = [0, 0, 0]
# ArcGIS does not like pixels that have zeros in them, eg. (255, 0, 0). We need to convert the zeros to ones, eg. (255, 1, 1).
mask = np.any(tile_data == 0, axis=-1) & np.any(tile_data != 0, axis=-1) # Identify pixels where not all bands are zero and at least one band is zero.
for i in range(3): # Iterate over each band.
# For these pixels, set zero bands to one.
tile_data[mask & (tile_data[:, :, i] == 0), i] = 1
# Calculate the position of the tile in the image data array.
row_pos = (row - min_row) * tile_size
col_pos = (col - min_col) * tile_size
tile_data = np.transpose(tile_data, (2, 0, 1))
# Write the tile data to the GeoTIFF file
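# The window is given in pixel coordinates as (col_off, row_off, width, height),
# so each tile lands at its offset in the output raster without the whole image
# ever being held in memory.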
dst.write(tile_data, window=rasterio.windows.Window(col_pos, row_pos, tile_size, tile_size), indexes=[1, 2, 3])
q.task_done()
pbar.update()
q = Queue()
lock = threading.Lock()
with tqdm(total=len(tiles), desc='Building GeoTIFF') as pbar:
threads = []
for i in range(args.tiff_threads):
output_tiff_thread = output_tiff.with_stem(output_tiff.stem + f'_part{i}')
t = threading.Thread(target=worker, args=(pbar, output_tiff_thread, tile_groups[i]))
t.start()
threads.append(t)
for i, tile_group in enumerate(tile_groups):
for row, col in tile_group:
q.put((row, col))
# block until all tasks are done
q.join()
# stop workers
for i in range(args.tiff_threads):
q.put(None)
for t in threads:
t.join()


@@ -1,6 +0,0 @@
def convert_seconds(seconds):
hours = seconds // 3600
seconds %= 3600
minutes = seconds // 60
seconds %= 60
return "%d:%02d:%02d" % (hours, minutes, seconds)


@@ -3,7 +3,6 @@ from pathlib import Path
import PIL
from PIL import Image
from tqdm import tqdm
def random_file_width(base_path: Path):
@@ -21,22 +20,7 @@ def is_png(file_path):
"""
try:
img = Image.open(file_path)
img.verify()
return img.format == 'PNG', img.format
return img.format == 'PNG'
except PIL.UnidentifiedImageError as e:
# tqdm.write(str(e))
return False, None
def convert_to_png(file_path):
try:
img = Image.open(file_path)
if img.format != 'PNG':
img.save(file_path, format='PNG')
return True
else:
return False
except Exception:
# import traceback
# traceback.print_exc()
return False


@@ -1,5 +1,3 @@
# Set your proxies here.
PROXIES = {
"http": 'http://172.0.4.7:9000',
"https": 'http://172.0.4.7:9000',


@@ -5,7 +5,7 @@ import requests
from tqdm import tqdm
from pkg.proxies import PROXIES
from .image import convert_to_png, is_png
from .image import is_png
def del_path(p: Path):
@@ -16,55 +16,27 @@ def del_path(p: Path):
def download_tile(task):
row, col, base_url, r_headers, output, use_proxy, do_convert_to_png = task
corrupted_image = False
row, col, base_url, r_headers, output, use_proxy = task
try:
output_path: Path = output / f"{row}_{col}.png"
if output_path.exists():
valid_png_file, image_type = is_png(output_path)
if do_convert_to_png and image_type and image_type != 'PNG':
# The image was read successfully by PIL but it's in the wrong format.
coverted = convert_to_png(output_path)
if not is_png(output_path):
tqdm.write(f'PNG conversion for {output_path} failed. Deleting and retrying...')
corrupted_image = True
else:
return row, col, 'converted' if coverted else 'exist'
elif not valid_png_file:
# We will re-download the image. Don't need to delete it, just overwrite it.
# del_path(output_path)
corrupted_image = True
tqdm.write(f'Bad image file: "{output_path}" (is format: {image_type}), deleting and retrying...')
if not is_png(output_path):
# Delete the file and try again.
del_path(output_path)
tqdm.write(f'cannot identify image file: "{output_path}", deleting and retrying...')
else:
return row, col, 'exist'
tile_url = f"{base_url}/{row}/{col}".replace('//', '/').replace(':/', '://')
response = requests.get(tile_url, headers=r_headers, proxies=PROXIES if use_proxy else None, timeout=60)
if response.status_code == 200:
# if not do_convert_to_png and not response.headers.get('Content-Type') == 'image/png':
# # If we will convert the image to a PNG, ignore this header.
# raise Exception(f'Response gave Content-Type: {response.headers.get("Content-Type")}')
if not response.headers.get('Content-Type') == 'image/png':
raise Exception(f'Response gave Content-Type: {response.headers.get("Content-Type")}')
with open(output_path, "wb") as f:
f.write(response.content)
if do_convert_to_png:
convert_to_png(output_path)
if not is_png(output_path)[0]:
tqdm.write(f'PNG conversion for {output_path} failed')
else:
return row, col, 'success' if not corrupted_image else 'fixed'
else:
# Recheck the PNG if it was corrupted.
valid_png_file, image_type = is_png(output_path)
if not valid_png_file:
tqdm.write(f'Bad image file: "{output_path}" (is format: {image_type}).')
return row, col, 'failure'
else:
return row, col, 'success' if not corrupted_image else 'fixed'
return row, col, 'success'
else:
print(f"Failed to download tile {row}_{col}")
return row, col, 'failure'
except Exception as e:
# import traceback
# traceback.print_exc()
tqdm.write(f'Exception on {(row, col)} - {e.__class__.__name__}: {e}')
return row, col, 'failure'


@@ -1,9 +1,7 @@
venv/bin/python3 exfiltrate.py \
https://wmts.nlsc.gov.tw/wmts/nURBAN2/default/EPSG:3857/ \
https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/ \
--zoom 20 \
--referer https://maps.nlsc.gov.tw/ \
--bbox 25.076387 121.68951 25.068282 121.700175 \
--dl-threads 30 \
--output ~/Downloads/wmts-output/ \
--download-loops 2 \
--convert
--threads 30 --tiff-threads 100 \
--output ~/Downloads/wmts-output/ --no-download