import argparse import base64 import os import time from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import numpy as np import rasterio from PIL import Image from rasterio import Affine from tqdm import tqdm from pkg.helpers import convert_seconds from pkg.image import random_file_width from pkg.spatial import deg2num from pkg.thread import download_tile if __name__ == '__main__': main_start_time = time.time() parser = argparse.ArgumentParser(description='Exfiltrate data from WMS servers.') parser.add_argument('base_url', help='The base URL for the WMS server. Example: https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/') parser.add_argument('--zoom', type=int, required=True, help='The zoom level to use.') parser.add_argument('--bbox', required=True, type=str, metavar='Bounding Box', nargs='+', default=(None, None, None, None), help='Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon)') parser.add_argument('--dl-threads', type=int, default=10, help='Number of download threads to use.') parser.add_argument('--referer', help='The content of the Referer header to send.') parser.add_argument('--output', default='wmts-output', help='Output directory path.') parser.add_argument('--proxy', action='store_true', help='Enable using a proxy. You must modify pkg/proxies.py') parser.add_argument('--tiff-threads', default=None, type=int, help=f'Number of threads to use when building TIFF. Default: {min(32, (os.cpu_count() or 1) + 4)}') # https://github.com/python/cpython/blob/3.10/Lib/concurrent/futures/thread.py#L142C27-L142C61 parser.add_argument('--output-tiff', help='Path for output GeoTIFF. Default: wmts-output/output.tiff') parser.add_argument('--no-download', action='store_true', help="Don't do any downloading or image checking.") parser.add_argument('--download-loops', default=1, type=int, help='Sometimes the tiles are downloaded incorrectly. Re-running the download process can fix these corrupted tiles. This arg specifies how many times to run the download process. Default: 1') parser.add_argument('--convert', action='store_true', help="Convert tiles to PNG. Sometimes the server will return transparent GIFs when there are blank tiles so this will convert them to a PNG.") args = parser.parse_args() if args.download_loops <= 0: print('--download-loops must be greater than 0') quit(1) args.base_url = args.base_url.strip('/') + f'/{args.zoom}/' base_output = Path(args.output).resolve().absolute().expanduser() url_hash = base64.b64encode(args.base_url.encode()).decode('utf-8').strip('==') tiles_output = base_output / url_hash / str(args.zoom) tiles_output.mkdir(parents=True, exist_ok=True) top_left_lat, top_left_lon, bottom_right_lat, bottom_right_lon = map(float, args.bbox) min_col, min_row = deg2num(top_left_lat, top_left_lon, args.zoom) max_col, max_row = deg2num(bottom_right_lat, bottom_right_lon, args.zoom) if args.output_tiff: output_tiff = Path(args.output_tiff) else: output_tiff = base_output / f'output-z{args.zoom}-{top_left_lat}x{top_left_lon}-{bottom_right_lat}x{bottom_right_lon}.tiff' r_headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', 'Accept': 'image/avif,image/webp,*/*', 'Accept-Language': 'en-US,en;q=0.5' } if args.referer: r_headers['Referer'] = args.referer retry_files = set() tiles = set() total_new_files = 0 total_fixed_files = 0 row_bar = tqdm(total=0, desc='Row 000 | Loop 0/0', postfix={'new_files': 0, 'failures': 0, 'fixed': 0}) for i in range(1, args.download_loops + 1): row_bar.reset() converted_files = 0 fixed_files = 0 new_files = 0 row_i = min_row row_iter = range(min_row, max_row + 1) row_bar.total = len(row_iter) row_bar.desc = f'Row {row_i} | Loop {i}/{args.download_loops}' def update_bar_postfix(): row_bar.set_postfix({'new': new_files, 'failures': len(retry_files), 'fixed': fixed_files, 'converted': converted_files}) update_bar_postfix() for row in row_iter: row_i = row col_iter = range(min_col, max_col + 1) if args.no_download: for col in col_iter: tiles.add((row, col)) else: col_bar = tqdm(total=len(col_iter), leave=False) # On the final download loop (if we are doing multiple), don't convert any files. do_convert = False if args.convert and 1 < args.download_loops == i else args.convert with (ThreadPoolExecutor(args.dl_threads) as executor): futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy, do_convert)) for col in col_iter] for future in as_completed(futures): result = future.result() if result: result_row, result_col, new_image = result if new_image == 'success': new_files += 1 total_new_files += 1 tiles.add((result_row, result_col)) elif new_image == 'exist': tiles.add((result_row, result_col)) elif new_image == 'failure': retry_files.add((result_row, result_col)) elif new_image == 'fixed': tiles.add((result_row, result_col)) fixed_files += 1 total_fixed_files += 1 elif new_image == 'converted': tiles.add((result_row, result_col)) converted_files += 1 col_bar.update() row_bar.refresh() col_bar.close() update_bar_postfix() row_bar.update() if total_new_files == 0 and total_fixed_files == 0: break row_bar.close() if total_new_files == 0 and total_fixed_files == 0: print('All files downloaded, exiting download loop.') col_bar = tqdm(total=len(retry_files), desc=f'Tile Retries') with ThreadPoolExecutor(args.dl_threads) as executor: futures = [executor.submit(download_tile, (row, col, args.base_url, r_headers, tiles_output, args.proxy)) for row, col in retry_files] for future in as_completed(futures): result = future.result() if result: result_row, result_col, new_image = result tiles.add((result_row, result_col)) if new_image == 'success': new_files += 1 elif new_image == 'failure': col_bar.write(f'{(result_row, result_col)} failed!') col_bar.update() col_bar.close() print(f'Downloaded {new_files} images.') print('Determining tile width...', end='') tile_size = random_file_width(tiles_output) print(f' {tile_size}px') # Define the number of rows and columns based on the bounding box print('Calculating maximum columns and rows...', end='') num_rows = max_row - min_row + 1 num_cols = max_col - min_col + 1 print(f' {num_cols}x{num_rows}') # Create an empty array to store the image data print(f'Allocating an array with shape {num_rows * tile_size, num_cols * tile_size} and dimension 3...') image_data = np.empty((num_rows * tile_size, num_cols * tile_size, 3), dtype=np.uint8) def build_tiff_data(task): row, col = task tile_file = tiles_output / f"{row}_{col}.png" if not tile_file.is_file(): raise Exception(f'Tile does not exist: {tile_file}') with Image.open(tile_file) as img: tile_data = np.array(img) # Remove the alpha channel tile_data = tile_data[:, :, :3] # Replace white pixels with NODATA tile_data[np.all(tile_data == [255, 255, 255], axis=-1)] = [0, 0, 0] # ArcGIS does not like pixels that have zeros in them, eg. (255, 0, 0). We need to convert the zeros to ones, eg. (255, 1, 1). mask = np.any(tile_data == 0, axis=-1) & np.any(tile_data != 0, axis=-1) # Identify pixels where not all bands are zero and at least one band is zero. for i in range(3): # Iterate over each band. # For these pixels, set zero bands to one. tile_data[mask & (tile_data[:, :, i] == 0), i] = 1 # Calculate the position of the tile in the image data array. row_pos = (row - min_row) * tile_size col_pos = (col - min_col) * tile_size # Insert the tile data into the image data array at the correct spot. image_data[row_pos:row_pos + tile_size, col_pos:col_pos + tile_size] = tile_data with ThreadPoolExecutor(max_workers=args.tiff_threads) as executor: futures = {executor.submit(build_tiff_data, task) for task in tiles} bar = tqdm(total=len(futures), desc='Building TIFF', postfix='There may be a lengthy startup time, please be patient!') for future in as_completed(futures): bar.set_postfix() bar.update() bar.close() # Transpose the image data array to the format (bands, rows, cols). print('Transposing...') image_data = np.transpose(image_data, (2, 0, 1)) # Define the transformation from pixel coordinates to geographic coordinates, which is an Affine transformation that # maps pixel coordinates in the image to geographic coordinates on the Earth's surface. print('Calculating transform...') transform = (Affine.translation(top_left_lon, top_left_lat) # Create a translation transformation that shifts the image and set the origin of the image to the top-left corner of the bounding box. # Create a scaling transformation that scales the image in the x and y directions to convert the pixel coordinates of the image to the geographic coordinates of the bounding box. * Affine.scale((bottom_right_lon - top_left_lon) / image_data.shape[2], (bottom_right_lat - top_left_lat) / image_data.shape[1])) # Write the image data to a GeoTIFF file print('Writing TIFF to:', output_tiff) start_write_tiff = time.time() with rasterio.open(output_tiff, "w", driver="GTiff", height=num_rows * tile_size, width=num_cols * tile_size, count=3, dtype=str(image_data.dtype), crs='EPSG:4326', transform=transform, compress="DEFLATE", nodata=0) as dst: dst.write(image_data, indexes=[1, 2, 3]) now = time.time() print(f'Time to write TIFF: {convert_seconds(int(now - start_write_tiff))} seconds. Total run time: {convert_seconds(int(now - main_start_time))}')