Compare commits
26 Commits
tiff-threa
...
master
Author | SHA1 | Date |
---|---|---|
Cyberes | 1d222ff2eb | |
Cyberes | 9e286c1f0c | |
Cyberes | a70e078c3f | |
Cyberes | 37ed3dc93a | |
Cyberes | 28ad261c83 | |
Cyberes | 1c257f3102 | |
Cyberes | 09cf9c32f6 | |
Cyberes | 65d3daa057 | |
Cyberes | 47635635b5 | |
Cyberes | c45da1cf0c | |
Cyberes | 0fb26ccf1d | |
Cyberes | 91032c19c9 | |
Cyberes | 1b64189ac9 | |
Cyberes | ace6f641b6 | |
Cyberes | 1f118b1dc9 | |
Cyberes | 1bbbc339a8 | |
Cyberes | 1006fc7d49 | |
Cyberes | 41c8ccfe12 | |
Cyberes | 65e247dae6 | |
Cyberes | d3d818a57e | |
Cyberes | 58d765b88c | |
Cyberes | 359966e97e | |
Cyberes | 3b8a4bbaf1 | |
Cyberes | 89e03a944c | |
Cyberes | 299639f1e8 | |
Cyberes | 923d868c85 |
47
README.md
47
README.md
|
@ -2,6 +2,8 @@
|
|||
|
||||
_Scrape tiles from WMTS servers._
|
||||
|
||||
|
||||
|
||||
You know what I hate? Those godforsaken WMTS servers, perched on their digital thrones, acting like they're the TILE TYRANTS of the universe. They think they can just LOCK UP their precious little tiles and keep me from doing my THING? HA!
|
||||
|
||||
No more will these WMTS servers shroud their CRAPPY-ASS tiles in mystery. I'm coming for your DATA, you binary BASTARDS, and there's not a SINGLE 1 or 0 you can throw at me that will stop my CHARGE.
|
||||
|
@ -10,29 +12,44 @@ You think your firewalls and security mumbo-jumbo can keep me at bay? THINK AGAI
|
|||
|
||||
So, buckle up, WMTS servers. Your reign of TILE TERROR is about to CRASH AND BURN. I'm coming for your DATA, and I'm bringing a whole lot of CHAOS with me.
|
||||
|
||||
### Install
|
||||
|
||||
|
||||
## Install
|
||||
|
||||
It's recommended to use a venv.
|
||||
|
||||
```shell
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Use
|
||||
|
||||
Example:
|
||||
|
||||
```shell
|
||||
python3 exfiltrate.py \
|
||||
https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/ \
|
||||
--zoom 20 \
|
||||
--referer https://maps.nlsc.gov.tw/ \
|
||||
--bbox 25.076387 121.68951 25.068282 121.700175 \
|
||||
--threads 30
|
||||
```
|
||||
## Use
|
||||
|
||||
### ArcGIS
|
||||
Some WMTS servers require the correct `Referer` header to be set, otherwise they will reject your request or return blank data. Use `--referer` to set this header.
|
||||
|
||||
???
|
||||
Do `./exfiltrate.py -h` to get more info on what the different command args do.
|
||||
|
||||
### Credits
|
||||
Building the GeoTIFF will take dozens of gigs of memory for any significant extent! For example, a 336 square mile extent required about 280GB of memory. You can use swap, but will need a very fast SSD. I had good results with a Samsung 980 PRO partitioned to swap.
|
||||
|
||||
https://jimmyutterstrom.com/blog/2019/06/05/map-tiles-to-geotiff/
|
||||
Be careful not to go overboard with your spatial extent. Use only what you need to avoid unnecessary processing time; otherwise you can easily end up in a situation where it takes a week to download all the tiles and build the TIFF.
|
||||
|
||||
`test.sh` is provided for demonstration. It downloads 856 tiles (less than 2MB) from a WMTS server in Taiwan.
|
||||
|
||||
|
||||
|
||||
## Output
|
||||
|
||||
This program outputs a geo-referenced TIFF image with three bands corresponding to the red, green, and blue values of the original tiles' pixels. If one of the bands has a value of `0`, then that value is adjusted to `1` so that it does not conflict with the `NODATA` value of `0`.
|
||||
|
||||
|
||||
|
||||
## ArcGIS
|
||||
|
||||
The generated TIFF raster should be fully compatible with ArcGIS but if you encounter color issues, try adjusting your symbology.
|
||||
|
||||
|
||||
|
||||
## Inspiration
|
||||
|
||||
https://jimmyutterstrom.com/blog/2019/06/05/map-tiles-to-geotiff/
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import argparse
|
||||
import base64
|
||||
import os
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
@ -10,23 +11,32 @@ from PIL import Image
|
|||
from rasterio import Affine
|
||||
from tqdm import tqdm
|
||||
|
||||
from pkg.helpers import convert_seconds
|
||||
from pkg.image import random_file_width
|
||||
from pkg.spatial import deg2num, lonlat_to_meters
|
||||
from pkg.spatial import deg2num
|
||||
from pkg.thread import download_tile
|
||||
|
||||
if __name__ == '__main__':
|
||||
main_start_time = time.time()
|
||||
parser = argparse.ArgumentParser(description='Exfiltrate data from WMS servers.')
|
||||
parser.add_argument('base_url', help='The base URL for the WMS server. Example: https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/')
|
||||
parser.add_argument('--zoom', type=int, required=True, help='The zoom level to use.')
|
||||
parser.add_argument('--threads', type=int, default=10, help='Number of download threads to use.')
|
||||
parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)')
|
||||
parser.add_argument('--dl-threads', type=int, default=10, help='Number of download threads to use.')
|
||||
parser.add_argument('--referer', help='The content of the Referer header to send.')
|
||||
parser.add_argument('--output', default='wmts-output', help='Output directory path.')
|
||||
parser.add_argument('--proxy', action='store_true', help='Enable using a proxy.')
|
||||
parser.add_argument('--tiff-threads', default=None, help='Number of threads to use when building TIFF. Default: auto')
|
||||
parser.add_argument('--proxy', action='store_true', help='Enable using a proxy. You must modify pkg/proxies.py')
|
||||
parser.add_argument('--tiff-threads', default=None, type=int, help=f'Number of threads to use when building TIFF. Default: {min(32, (os.cpu_count() or 1) + 4)}') # https://github.com/python/cpython/blob/3.10/Lib/concurrent/futures/thread.py#L142C27-L142C61
|
||||
parser.add_argument('--output-tiff', help='Path for output GeoTIFF. Default: wmts-output/output.tiff')
|
||||
parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)')
|
||||
parser.add_argument('--no-download', action='store_true', help="Don't do any downloading or image checking.")
|
||||
parser.add_argument('--download-loops', default=1, type=int, help='Sometimes the tiles are downloaded incorrectly. Re-running the download process can fix these corrupted tiles. This arg specifies how many times to run the download process. Default: 1')
|
||||
parser.add_argument('--convert', action='store_true', help="Convert tiles to PNG. Sometimes the server will return transparent GIFs when there are blank tiles so this will convert them to a PNG.")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.download_loops <= 0:
|
||||
print('--download-loops must be greater than 0')
|
||||
quit(1)
|
||||
|
||||
args.base_url = args.base_url.strip('/') + f'/{args.zoom}/'
|
||||
base_output = Path(args.output).resolve().absolute().expanduser()
|
||||
url_hash = base64.b64encode(args.base_url.encode()).decode('utf-8').strip('==')
|
||||
|
@ -50,61 +60,103 @@ if __name__ == '__main__':
|
|||
if args.referer:
|
||||
r_headers['Referer'] = args.referer
|
||||
|
||||
tiles = []
|
||||
retries = []
|
||||
total_downloaded = 0
|
||||
row_i = min_row
|
||||
row_iter = range(min_row, max_row + 1)
|
||||
row_bar = tqdm(total=len(row_iter), desc=f'Row {row_i}', postfix={'new_files': total_downloaded, 'failures': len(retries)})
|
||||
for row in row_iter:
|
||||
row_i = row
|
||||
col_iter = range(min_col, max_col + 1)
|
||||
col_bar = tqdm(total=len(col_iter), leave=False)
|
||||
with (ThreadPoolExecutor(args.threads) as executor):
|
||||
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for col in col_iter]
|
||||
for future in as_completed(futures):
|
||||
result = future.result()
|
||||
if result:
|
||||
result_row, result_col, new_image = result
|
||||
if new_image == 'success':
|
||||
total_downloaded += 1
|
||||
tiles.append((result_row, result_col))
|
||||
elif new_image == 'exist':
|
||||
tiles.append((result_row, result_col))
|
||||
elif new_image == 'failure':
|
||||
retries.append((result_row, result_col))
|
||||
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
|
||||
col_bar.update()
|
||||
row_bar.refresh()
|
||||
col_bar.close()
|
||||
row_bar.set_postfix({'new_files': total_downloaded, 'failures': len(retries)})
|
||||
row_bar.update()
|
||||
retry_files = set()
|
||||
tiles = set()
|
||||
total_new_files = 0
|
||||
total_fixed_files = 0
|
||||
|
||||
row_bar = tqdm(total=0, desc='Row 000 | Loop 0/0', postfix={'new_files': 0, 'failures': 0, 'fixed': 0})
|
||||
for i in range(1, args.download_loops + 1):
|
||||
row_bar.reset()
|
||||
converted_files = 0
|
||||
fixed_files = 0
|
||||
new_files = 0
|
||||
row_i = min_row
|
||||
row_iter = range(min_row, max_row + 1)
|
||||
row_bar.total = len(row_iter)
|
||||
row_bar.desc = f'Row {row_i} | Loop {i}/{args.download_loops}'
|
||||
|
||||
|
||||
def update_bar_postfix():
|
||||
row_bar.set_postfix({'new': new_files, 'failures': len(retry_files), 'fixed': fixed_files, 'converted': converted_files})
|
||||
|
||||
|
||||
update_bar_postfix()
|
||||
|
||||
for row in row_iter:
|
||||
row_i = row
|
||||
col_iter = range(min_col, max_col + 1)
|
||||
if args.no_download:
|
||||
for col in col_iter:
|
||||
tiles.add((row, col))
|
||||
else:
|
||||
col_bar = tqdm(total=len(col_iter), leave=False)
|
||||
|
||||
# On the final download loop (if we are doing multiple), don't convert any files.
|
||||
do_convert = False if args.convert and 1 < args.download_loops == i else args.convert
|
||||
|
||||
with (ThreadPoolExecutor(args.dl_threads) as executor):
|
||||
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy, do_convert)) for col in col_iter]
|
||||
for future in as_completed(futures):
|
||||
result = future.result()
|
||||
if result:
|
||||
result_row, result_col, new_image = result
|
||||
if new_image == 'success':
|
||||
new_files += 1
|
||||
total_new_files += 1
|
||||
tiles.add((result_row, result_col))
|
||||
elif new_image == 'exist':
|
||||
tiles.add((result_row, result_col))
|
||||
elif new_image == 'failure':
|
||||
retry_files.add((result_row, result_col))
|
||||
elif new_image == 'fixed':
|
||||
tiles.add((result_row, result_col))
|
||||
fixed_files += 1
|
||||
total_fixed_files += 1
|
||||
elif new_image == 'converted':
|
||||
tiles.add((result_row, result_col))
|
||||
converted_files += 1
|
||||
col_bar.update()
|
||||
row_bar.refresh()
|
||||
col_bar.close()
|
||||
update_bar_postfix()
|
||||
row_bar.update()
|
||||
if total_new_files == 0 and total_fixed_files == 0:
|
||||
break
|
||||
row_bar.close()
|
||||
|
||||
col_bar = tqdm(total=len(retries), desc=f'Tile Retries')
|
||||
with ThreadPoolExecutor(args.threads) as executor:
|
||||
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for row, col in retries]
|
||||
if total_new_files == 0 and total_fixed_files == 0:
|
||||
print('All files downloaded, exiting download loop.')
|
||||
|
||||
col_bar = tqdm(total=len(retry_files), desc=f'Tile Retries')
|
||||
with ThreadPoolExecutor(args.dl_threads) as executor:
|
||||
futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for row, col in retry_files]
|
||||
for future in as_completed(futures):
|
||||
result = future.result()
|
||||
if result:
|
||||
result_row, result_col, new_image = result
|
||||
tiles.append((result_row, result_col))
|
||||
tiles.add((result_row, result_col))
|
||||
if new_image == 'success':
|
||||
total_downloaded += 1
|
||||
new_files += 1
|
||||
elif new_image == 'failure':
|
||||
col_bar.write(f'{(result_row, result_col)} failed!')
|
||||
col_bar.update()
|
||||
col_bar.close()
|
||||
|
||||
print(f'Downloaded {total_downloaded} images.')
|
||||
print(f'Downloaded {new_files} images.')
|
||||
|
||||
print('Determining tile width...', end='')
|
||||
tile_size = random_file_width(tiles_output)
|
||||
print(f' {tile_size}px')
|
||||
|
||||
# Define the number of rows and columns based on the bounding box
|
||||
print('Calculating maximum columns and rows...', end='')
|
||||
num_rows = max_row - min_row + 1
|
||||
num_cols = max_col - min_col + 1
|
||||
print(f' {num_cols}x{num_rows}')
|
||||
|
||||
# Create an empty array to store the image data
|
||||
print(f'Allocating an array with shape {num_rows * tile_size, num_cols * tile_size} and dimension 3...')
|
||||
image_data = np.empty((num_rows * tile_size, num_cols * tile_size, 3), dtype=np.uint8)
|
||||
|
||||
|
||||
|
@ -127,7 +179,7 @@ if __name__ == '__main__':
|
|||
mask = np.any(tile_data == 0, axis=-1) & np.any(tile_data != 0, axis=-1) # Identify pixels where not all bands are zero and at least one band is zero.
|
||||
for i in range(3): # Iterate over each band.
|
||||
# For these pixels, set zero bands to one.
|
||||
tile_data[mask & (tile_data[:, :, i] == 0), i] = 0.1
|
||||
tile_data[mask & (tile_data[:, :, i] == 0), i] = 1
|
||||
|
||||
# Calculate the position of the tile in the image data array.
|
||||
row_pos = (row - min_row) * tile_size
|
||||
|
@ -139,25 +191,27 @@ if __name__ == '__main__':
|
|||
|
||||
with ThreadPoolExecutor(max_workers=args.tiff_threads) as executor:
|
||||
futures = {executor.submit(build_tiff_data, task) for task in tiles}
|
||||
for future in tqdm(as_completed(futures), total=len(futures), desc='Building TIFF'):
|
||||
pass
|
||||
bar = tqdm(total=len(futures), desc='Building TIFF', postfix='There may be a lengthy startup time, please be patient!')
|
||||
for future in as_completed(futures):
|
||||
bar.set_postfix()
|
||||
bar.update()
|
||||
bar.close()
|
||||
|
||||
# Transpose the image data array to the format (bands, rows, cols).
|
||||
print('Transposing...')
|
||||
image_data = np.transpose(image_data, (2, 0, 1))
|
||||
|
||||
# Convert geographic coordinates to Web Mercator coordinates. Not 100% sure this is nessesary.
|
||||
top_left_mx, top_left_my = lonlat_to_meters(top_left_lon, top_left_lat)
|
||||
bottom_right_mx, bottom_right_my = lonlat_to_meters(bottom_right_lon, bottom_right_lat)
|
||||
|
||||
# Define the transformation from pixel coordinates to geographic coordinates, which is an Affine transformation that
|
||||
# maps pixel coordinates in the image to geographic coordinates on the Earth's surface.
|
||||
print('Calculating transform...')
|
||||
transform = (Affine.translation(top_left_lon, top_left_lat) # Create a translation transformation that shifts the image and set the origin of the image to the top-left corner of the bounding box.
|
||||
# Create a scaling transformation that scales the image in the x and y directions to convert the pixel coordinates of the image to the geographic coordinates of the bounding box.
|
||||
* Affine.scale((bottom_right_lon - top_left_lon) / image_data.shape[2], (bottom_right_lat - top_left_lat) / image_data.shape[1]))
|
||||
|
||||
# Write the image data to a GeoTIFF file
|
||||
print('Saving to:', output_tiff)
|
||||
start = time.time()
|
||||
print('Writing TIFF to:', output_tiff)
|
||||
start_write_tiff = time.time()
|
||||
with rasterio.open(output_tiff, "w", driver="GTiff", height=num_rows * tile_size, width=num_cols * tile_size, count=3, dtype=str(image_data.dtype), crs='EPSG:4326', transform=transform, compress="DEFLATE", nodata=0) as dst:
|
||||
dst.write(image_data, indexes=[1, 2, 3])
|
||||
print(f'Saved in {int(time.time() - start)} seconds.')
|
||||
now = time.time()
|
||||
print(f'Time to write TIFF: {convert_seconds(int(now - start_write_tiff))} seconds. Total run time: {convert_seconds(int(now - main_start_time))}')
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
def convert_seconds(seconds):
    """Format a duration in seconds as ``H:MM:SS``.

    Hours are not wrapped at 24, so long runs render as e.g. ``48:00:05``.
    Fractional input is truncated to whole seconds. Negative input is
    rendered as the magnitude with a leading ``-`` (floor division and
    modulo on a negative number would otherwise give ``-1:58:59`` for -61).

    :param seconds: duration in seconds (int or float).
    :return: the formatted duration string.
    """
    total = int(seconds)
    sign = '-' if total < 0 else ''
    minutes, secs = divmod(abs(total), 60)
    hours, minutes = divmod(minutes, 60)
    return "%s%d:%02d:%02d" % (sign, hours, minutes, secs)
|
22
pkg/image.py
22
pkg/image.py
|
@ -1,7 +1,9 @@
|
|||
import random
|
||||
from pathlib import Path
|
||||
|
||||
import PIL
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def random_file_width(base_path: Path):
|
||||
|
@ -19,6 +21,22 @@ def is_png(file_path):
|
|||
"""
|
||||
try:
|
||||
img = Image.open(file_path)
|
||||
return img.format == 'PNG'
|
||||
except:
|
||||
img.verify()
|
||||
return img.format == 'PNG', img.format
|
||||
except PIL.UnidentifiedImageError as e:
|
||||
# tqdm.write(str(e))
|
||||
return False, None
|
||||
|
||||
|
||||
def convert_to_png(file_path):
    """Re-encode the image at ``file_path`` as PNG, in place.

    :param file_path: path of the image file to convert; it is overwritten.
    :return: ``True`` if the file was in another format and was rewritten as
        PNG; ``False`` if it already was a PNG or if reading/writing failed.
    """
    try:
        # The context manager closes the underlying file handle; without it
        # every call leaves a handle open until garbage collection.
        with Image.open(file_path) as img:
            if img.format == 'PNG':
                return False
            # Decode the pixel data before saving, because the destination
            # is the very file the image is being read from.
            img.load()
            img.save(file_path, format='PNG')
            return True
    except Exception:
        # Deliberately best-effort: any failure is reported as "not converted".
        return False
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Set your proxies here.
|
||||
|
||||
PROXIES = {
|
||||
"http": 'http://172.0.4.7:9000',
|
||||
"https": 'http://172.0.4.7:9000',
|
||||
|
|
|
@ -5,7 +5,7 @@ import requests
|
|||
from tqdm import tqdm
|
||||
|
||||
from pkg.proxies import PROXIES
|
||||
from .image import is_png
|
||||
from .image import convert_to_png, is_png
|
||||
|
||||
|
||||
def del_path(p: Path):
|
||||
|
@ -16,27 +16,55 @@ def del_path(p: Path):
|
|||
|
||||
|
||||
def download_tile(task):
    """Download a single tile to ``<output>/<row>_<col>.png`` and report the outcome.

    :param task: tuple ``(row, col, base_url, r_headers, output, use_proxy[, do_convert_to_png])``.
        The trailing flag is optional and defaults to ``False`` so callers that
        still submit the six-element tuple keep working.
    :return: ``(row, col, status)`` where ``status`` is one of:
        ``'exist'`` (a valid PNG was already on disk),
        ``'converted'`` (an existing non-PNG file was converted in place),
        ``'success'`` (a new tile was downloaded),
        ``'fixed'`` (a bad file on disk was replaced by a fresh download),
        ``'failure'`` (HTTP error, invalid image, or any exception).
    """
    row, col, base_url, r_headers, output, use_proxy, *extra = task
    do_convert_to_png = bool(extra[0]) if extra else False
    corrupted_image = False
    try:
        output_path: Path = output / f"{row}_{col}.png"
        if output_path.exists():
            valid_png_file, image_type = is_png(output_path)
            if do_convert_to_png and image_type and image_type != 'PNG':
                # The image was read successfully by PIL but it's in the wrong format.
                converted = convert_to_png(output_path)
                # is_png returns a (valid, format) tuple, which is always truthy,
                # so the validity flag has to be tested explicitly.
                if not is_png(output_path)[0]:
                    tqdm.write(f'PNG conversion for {output_path} failed. Deleting and retrying...')
                    corrupted_image = True
                else:
                    return row, col, 'converted' if converted else 'exist'
            elif not valid_png_file:
                # The file is re-downloaded below; it is overwritten rather than deleted first.
                corrupted_image = True
                tqdm.write(f'Bad image file: "{output_path}" (is format: {image_type}), deleting and retrying...')
            else:
                return row, col, 'exist'

        tile_url = f"{base_url}/{row}/{col}".replace('//', '/').replace(':/', '://')
        response = requests.get(tile_url, headers=r_headers, proxies=PROXIES if use_proxy else None, timeout=60)
        if response.status_code != 200:
            print(f"Failed to download tile {row}_{col}")
            return row, col, 'failure'

        # The Content-Type header is not checked here; the bytes written to
        # disk are validated with is_png() instead.
        with open(output_path, "wb") as f:
            f.write(response.content)

        if do_convert_to_png:
            convert_to_png(output_path)
            if not is_png(output_path)[0]:
                tqdm.write(f'PNG conversion for {output_path} failed')
                # Report the failure explicitly so the caller can retry the
                # tile instead of receiving None and silently dropping it.
                return row, col, 'failure'
        else:
            # Recheck the downloaded file in case the server returned bad data.
            valid_png_file, image_type = is_png(output_path)
            if not valid_png_file:
                tqdm.write(f'Bad image file: "{output_path}" (is format: {image_type}).')
                return row, col, 'failure'
        return row, col, 'success' if not corrupted_image else 'fixed'
    except Exception as e:
        # Worker-thread boundary: any error is reported as a failed tile so
        # one bad request does not abort the whole download loop.
        tqdm.write(f'Exception on {(row, col)} - {e.__class__.__name__}: {e}')
        return row, col, 'failure'
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
venv/bin/python3 exfiltrate.py \
|
||||
https://wmts.nlsc.gov.tw/wmts/nURBAN2/default/EPSG:3857/ \
|
||||
--zoom 20 \
|
||||
--referer https://maps.nlsc.gov.tw/ \
|
||||
--bbox 25.076387 121.68951 25.068282 121.700175 \
|
||||
--dl-threads 30 \
|
||||
--output ~/Downloads/wmts-output/ \
|
||||
--download-loops 2 \
|
||||
--convert
|
Loading…
Reference in New Issue