2023-11-02 23:35:43 -06:00
import argparse
import base64
2023-11-10 18:00:10 -07:00
import os
2023-11-02 23:35:43 -06:00
import time
from concurrent . futures import ThreadPoolExecutor , as_completed
from pathlib import Path
import numpy as np
import rasterio
from PIL import Image
from rasterio import Affine
from tqdm import tqdm
2023-11-10 23:41:01 -07:00
from pkg . helpers import convert_seconds
2023-11-02 23:35:43 -06:00
from pkg . image import random_file_width
2023-11-06 19:06:06 -07:00
from pkg . spatial import deg2num
2023-11-02 23:35:43 -06:00
from pkg . thread import download_tile
if __name__ == ' __main__ ' :
2023-11-10 23:41:01 -07:00
main_start_time = time . time ( )
2023-11-02 23:35:43 -06:00
parser = argparse . ArgumentParser ( description = ' Exfiltrate data from WMS servers. ' )
parser . add_argument ( ' base_url ' , help = ' The base URL for the WMS server. Example: https://wmts.nlsc.gov.tw/wmts/nURBAN/default/EPSG:3857/ ' )
parser . add_argument ( ' --zoom ' , type = int , required = True , help = ' The zoom level to use. ' )
2023-11-11 11:39:51 -07:00
parser . add_argument ( ' --bbox ' , required = True , type = str , metavar = ' Bounding Box ' , nargs = ' + ' , default = ( None , None , None , None ) , help = ' Bounding Box of the area to download. Separate each value with a space. (top left lat, top left lon, bottom right lat, bottom right lon) ' )
2023-11-06 19:57:57 -07:00
parser . add_argument ( ' --dl-threads ' , type = int , default = 10 , help = ' Number of download threads to use. ' )
2023-11-02 23:35:43 -06:00
parser . add_argument ( ' --referer ' , help = ' The content of the Referer header to send. ' )
parser . add_argument ( ' --output ' , default = ' wmts-output ' , help = ' Output directory path. ' )
2023-11-11 11:39:51 -07:00
parser . add_argument ( ' --proxy ' , action = ' store_true ' , help = ' Enable using a proxy. You must modify pkg/proxies.py ' )
2023-11-10 18:00:10 -07:00
parser . add_argument ( ' --tiff-threads ' , default = None , type = int , help = f ' Number of threads to use when building TIFF. Default: { min ( 32 , ( os . cpu_count ( ) or 1 ) + 4 ) } ' ) # https://github.com/python/cpython/blob/3.10/Lib/concurrent/futures/thread.py#L142C27-L142C61
2023-11-02 23:35:43 -06:00
parser . add_argument ( ' --output-tiff ' , help = ' Path for output GeoTIFF. Default: wmts-output/output.tiff ' )
2023-11-06 18:55:43 -07:00
parser . add_argument ( ' --no-download ' , action = ' store_true ' , help = " Don ' t do any downloading or image checking. " )
2023-11-11 10:55:43 -07:00
parser . add_argument ( ' --download-loops ' , default = 1 , type = int , help = ' Sometimes the tiles are downloaded incorrectly. Re-running the download process can fix these corrupted tiles. This arg specifies how many times to run the download process. Default: 1 ' )
2023-11-11 11:39:51 -07:00
parser . add_argument ( ' --convert ' , action = ' store_true ' , help = " Convert tiles to PNG. Sometimes the server will return transparent GIFs when there are blank tiles so this will convert them to a PNG. " )
2023-11-02 23:35:43 -06:00
args = parser . parse_args ( )
2023-11-11 10:55:43 -07:00
if args . download_loops < = 0 :
print ( ' --download-loops must be greater than 0 ' )
quit ( 1 )
2023-11-02 23:35:43 -06:00
args . base_url = args . base_url . strip ( ' / ' ) + f ' / { args . zoom } / '
base_output = Path ( args . output ) . resolve ( ) . absolute ( ) . expanduser ( )
url_hash = base64 . b64encode ( args . base_url . encode ( ) ) . decode ( ' utf-8 ' ) . strip ( ' == ' )
tiles_output = base_output / url_hash / str ( args . zoom )
tiles_output . mkdir ( parents = True , exist_ok = True )
top_left_lat , top_left_lon , bottom_right_lat , bottom_right_lon = map ( float , args . bbox )
min_col , min_row = deg2num ( top_left_lat , top_left_lon , args . zoom )
max_col , max_row = deg2num ( bottom_right_lat , bottom_right_lon , args . zoom )
if args . output_tiff :
output_tiff = Path ( args . output_tiff )
else :
output_tiff = base_output / f ' output-z { args . zoom } - { top_left_lat } x { top_left_lon } - { bottom_right_lat } x { bottom_right_lon } .tiff '
r_headers = {
' User-Agent ' : ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 ' ,
' Accept ' : ' image/avif,image/webp,*/* ' ,
' Accept-Language ' : ' en-US,en;q=0.5 '
}
if args . referer :
r_headers [ ' Referer ' ] = args . referer
2023-11-11 11:30:41 -07:00
retry_files = set ( )
tiles = set ( )
2023-11-11 14:01:12 -07:00
total_new_files = 0
total_fixed_files = 0
2023-11-11 11:30:41 -07:00
row_bar = tqdm ( total = 0 , desc = ' Row 000 | Loop 0/0 ' , postfix = { ' new_files ' : 0 , ' failures ' : 0 , ' fixed ' : 0 } )
2023-11-11 10:55:43 -07:00
for i in range ( 1 , args . download_loops + 1 ) :
row_bar . reset ( )
2023-11-11 11:34:42 -07:00
converted_files = 0
2023-11-11 11:30:41 -07:00
fixed_files = 0
2023-11-11 14:01:12 -07:00
new_files = 0
2023-11-11 10:55:43 -07:00
row_i = min_row
row_iter = range ( min_row , max_row + 1 )
row_bar . total = len ( row_iter )
row_bar . desc = f ' Row { row_i } | Loop { i } / { args . download_loops } '
2023-11-11 11:34:42 -07:00
def update_bar_postfix ( ) :
2023-11-11 14:01:12 -07:00
row_bar . set_postfix ( { ' new ' : new_files , ' failures ' : len ( retry_files ) , ' fixed ' : fixed_files , ' converted ' : converted_files } )
2023-11-11 11:34:42 -07:00
update_bar_postfix ( )
2023-11-11 10:55:43 -07:00
for row in row_iter :
row_i = row
col_iter = range ( min_col , max_col + 1 )
if args . no_download :
for col in col_iter :
2023-11-11 11:30:41 -07:00
tiles . add ( ( row , col ) )
2023-11-11 10:55:43 -07:00
else :
col_bar = tqdm ( total = len ( col_iter ) , leave = False )
2023-11-11 11:22:00 -07:00
# On the final download loop (if we are doing multiple), don't convert any files.
do_convert = False if args . convert and 1 < args . download_loops == i else args . convert
2023-11-11 10:55:43 -07:00
with ( ThreadPoolExecutor ( args . dl_threads ) as executor ) :
2023-11-11 11:22:00 -07:00
futures = [ executor . submit ( download_tile , ( row , col , args . base_url , r_headers , tiles_output , args . proxy , do_convert ) ) for col in col_iter ]
2023-11-11 10:55:43 -07:00
for future in as_completed ( futures ) :
result = future . result ( )
if result :
result_row , result_col , new_image = result
if new_image == ' success ' :
2023-11-11 14:01:12 -07:00
new_files + = 1
total_new_files + = 1
2023-11-11 11:30:41 -07:00
tiles . add ( ( result_row , result_col ) )
2023-11-11 10:55:43 -07:00
elif new_image == ' exist ' :
2023-11-11 11:30:41 -07:00
tiles . add ( ( result_row , result_col ) )
2023-11-11 10:55:43 -07:00
elif new_image == ' failure ' :
2023-11-11 11:30:41 -07:00
retry_files . add ( ( result_row , result_col ) )
elif new_image == ' fixed ' :
tiles . add ( ( result_row , result_col ) )
fixed_files + = 1
2023-11-11 14:01:12 -07:00
total_fixed_files + = 1
2023-11-11 11:34:42 -07:00
elif new_image == ' converted ' :
tiles . add ( ( result_row , result_col ) )
converted_files + = 1
2023-11-11 10:55:43 -07:00
col_bar . update ( )
row_bar . refresh ( )
col_bar . close ( )
2023-11-11 11:34:42 -07:00
update_bar_postfix ( )
2023-11-11 10:55:43 -07:00
row_bar . update ( )
2023-11-11 14:01:12 -07:00
if total_new_files == 0 and total_fixed_files == 0 :
break
2023-11-03 16:31:53 -06:00
row_bar . close ( )
2023-11-11 14:01:12 -07:00
if total_new_files == 0 and total_fixed_files == 0 :
print ( ' All files downloaded, exiting download loop. ' )
2023-11-11 11:30:41 -07:00
col_bar = tqdm ( total = len ( retry_files ) , desc = f ' Tile Retries ' )
2023-11-06 19:57:09 -07:00
with ThreadPoolExecutor ( args . dl_threads ) as executor :
2023-11-11 11:30:41 -07:00
futures = [ executor . submit ( download_tile , ( row , col , args . base_url , r_headers , tiles_output , args . proxy ) ) for row , col in retry_files ]
2023-11-03 16:31:53 -06:00
for future in as_completed ( futures ) :
result = future . result ( )
if result :
result_row , result_col , new_image = result
2023-11-11 11:30:41 -07:00
tiles . add ( ( result_row , result_col ) )
2023-11-03 16:31:53 -06:00
if new_image == ' success ' :
2023-11-11 14:01:12 -07:00
new_files + = 1
2023-11-03 16:31:53 -06:00
elif new_image == ' failure ' :
col_bar . write ( f ' { ( result_row , result_col ) } failed! ' )
col_bar . update ( )
2023-11-05 20:23:29 -07:00
col_bar . close ( )
2023-11-02 23:35:43 -06:00
2023-11-11 14:01:12 -07:00
print ( f ' Downloaded { new_files } images. ' )
2023-11-02 23:35:43 -06:00
2023-11-06 20:24:52 -07:00
print ( ' Determining tile width... ' , end = ' ' )
2023-11-02 23:35:43 -06:00
tile_size = random_file_width ( tiles_output )
2023-11-06 20:24:52 -07:00
print ( f ' { tile_size } px ' )
2023-11-02 23:35:43 -06:00
# Define the number of rows and columns based on the bounding box
2023-11-06 20:24:52 -07:00
print ( ' Calculating maximum columns and rows... ' , end = ' ' )
2023-11-02 23:35:43 -06:00
num_rows = max_row - min_row + 1
num_cols = max_col - min_col + 1
2023-11-06 20:24:52 -07:00
print ( f ' { num_cols } x { num_rows } ' )
2023-11-02 23:35:43 -06:00
# Create an empty array to store the image data
2023-11-06 20:40:07 -07:00
print ( f ' Allocating an array with shape { num_rows * tile_size , num_cols * tile_size } and dimension 3... ' )
2023-11-02 23:35:43 -06:00
image_data = np . empty ( ( num_rows * tile_size , num_cols * tile_size , 3 ) , dtype = np . uint8 )
def build_tiff_data ( task ) :
row , col = task
tile_file = tiles_output / f " { row } _ { col } .png "
if not tile_file . is_file ( ) :
raise Exception ( f ' Tile does not exist: { tile_file } ' )
with Image . open ( tile_file ) as img :
tile_data = np . array ( img )
# Remove the alpha channel
tile_data = tile_data [ : , : , : 3 ]
2023-11-03 01:57:45 -06:00
# Replace white pixels with NODATA
tile_data [ np . all ( tile_data == [ 255 , 255 , 255 ] , axis = - 1 ) ] = [ 0 , 0 , 0 ]
2023-11-03 02:06:46 -06:00
# ArcGIS does not like pixels that have zeros in them, eg. (255, 0, 0). We need to convert the zeros to ones, eg. (255, 1, 1).
2023-11-03 01:57:45 -06:00
mask = np . any ( tile_data == 0 , axis = - 1 ) & np . any ( tile_data != 0 , axis = - 1 ) # Identify pixels where not all bands are zero and at least one band is zero.
for i in range ( 3 ) : # Iterate over each band.
# For these pixels, set zero bands to one.
2023-11-12 13:08:12 -07:00
tile_data [ mask & ( tile_data [ : , : , i ] == 0 ) , i ] = 1
2023-11-03 01:57:45 -06:00
# Calculate the position of the tile in the image data array.
2023-11-02 23:35:43 -06:00
row_pos = ( row - min_row ) * tile_size
col_pos = ( col - min_col ) * tile_size
2023-11-03 01:57:45 -06:00
# Insert the tile data into the image data array at the correct spot.
2023-11-02 23:35:43 -06:00
image_data [ row_pos : row_pos + tile_size , col_pos : col_pos + tile_size ] = tile_data
2023-11-05 20:34:45 -07:00
with ThreadPoolExecutor ( max_workers = args . tiff_threads ) as executor :
2023-11-02 23:35:43 -06:00
futures = { executor . submit ( build_tiff_data , task ) for task in tiles }
2023-11-06 20:40:07 -07:00
bar = tqdm ( total = len ( futures ) , desc = ' Building TIFF ' , postfix = ' There may be a lengthy startup time, please be patient! ' )
for future in as_completed ( futures ) :
bar . set_postfix ( )
bar . update ( )
2023-11-10 18:00:10 -07:00
bar . close ( )
2023-11-02 23:35:43 -06:00
2023-11-03 01:57:45 -06:00
# Transpose the image data array to the format (bands, rows, cols).
2023-11-06 20:24:52 -07:00
print ( ' Transposing... ' )
2023-11-02 23:35:43 -06:00
image_data = np . transpose ( image_data , ( 2 , 0 , 1 ) )
2023-11-03 01:57:45 -06:00
# Define the transformation from pixel coordinates to geographic coordinates, which is an Affine transformation that
# maps pixel coordinates in the image to geographic coordinates on the Earth's surface.
2023-11-06 20:24:52 -07:00
print ( ' Calculating transform... ' )
2023-11-03 01:57:45 -06:00
transform = ( Affine . translation ( top_left_lon , top_left_lat ) # Create a translation transformation that shifts the image and set the origin of the image to the top-left corner of the bounding box.
# Create a scaling transformation that scales the image in the x and y directions to convert the pixel coordinates of the image to the geographic coordinates of the bounding box.
* Affine . scale ( ( bottom_right_lon - top_left_lon ) / image_data . shape [ 2 ] , ( bottom_right_lat - top_left_lat ) / image_data . shape [ 1 ] ) )
2023-11-02 23:35:43 -06:00
# Write the image data to a GeoTIFF file
2023-11-10 23:41:01 -07:00
print ( ' Writing TIFF to: ' , output_tiff )
start_write_tiff = time . time ( )
2023-11-03 01:57:45 -06:00
with rasterio . open ( output_tiff , " w " , driver = " GTiff " , height = num_rows * tile_size , width = num_cols * tile_size , count = 3 , dtype = str ( image_data . dtype ) , crs = ' EPSG:4326 ' , transform = transform , compress = " DEFLATE " , nodata = 0 ) as dst :
2023-11-02 23:35:43 -06:00
dst . write ( image_data , indexes = [ 1 , 2 , 3 ] )
2023-11-10 23:41:01 -07:00
now = time . time ( )
print ( f ' Time to write TIFF: { convert_seconds ( int ( now - start_write_tiff ) ) } seconds. Total run time: { convert_seconds ( int ( now - main_start_time ) ) } ' )