#
# extract-waterfall.R, 17 Jul 22
#
# Data from:
# https://techtalk.pcmatic.com/research-charts-memory/
#

library("colorspace")
library("png")


# Count points having color col on every horizontal line
horizontal_line=function(a_img, col)
{
lines_col=sapply(1:n_lines, function(X) sum((a_img[X, , 1]==col[1]) &
						(a_img[X, , 2]==col[2]) &
						(a_img[X, , 3]==col[3]))
		)

return(lines_col)
}


# Count points having color col on every vertical line
vertical_line=function(a_img, col)
{
lines_col=sapply(1:n_cols, function(X) sum((a_img[ , X, 1]==col[1]) &
						(a_img[ , X, 2]==col[2]) &
						(a_img[ , X, 3]==col[3]))
		)

return(lines_col)
}


# Return y position of vertical color changes at x_pos
y_col_change=function(x_pos)
{
# A good enough technique for generating a unique value per RGB color
col_change=which(!duplicated(img[y_range, x_pos, 1]+
						10*img[y_range, x_pos, 2]+
						100*img[y_range, x_pos, 3]))

# There is a 1-pixel separation line between colors, which creates a run of 2 or 3 consecutive numbers.
# Diff is used to find these consecutive sequences.
y_change=c(1, col_change[which(diff(col_change) > 1)+1])

# Always return a vector containing max_vals elements (or d)...
return(c(y_change, rep(NA, max_vals-length(y_change))))
}


# Image read in row/col order, i.e., y/x
# With (0, 0) in top left
img=readPNG("../rc_mem_memrange_all.php.png")

n_lines=dim(img)[1]
n_cols=dim(img)[2]

max_vals=20 # Needs to be larger than classified values because of separator line color

white=c(1, 1, 1)

white_horiz=horizontal_line(img, white)

ylim=c(0, which(abs(diff(white_horiz/n_cols)) > 0.5)) # handle when upper boundary is missing
ylim=ylim[2:3]
y_range=ylim[1]:ylim[2]

plot(100*white_horiz/n_cols,
	xaxs="i", yaxs="i",
	xlim=c(0, n_lines), ylim=c(0, 100),
	xlab="Line (horizontal)", ylab="Percent white")

white_vert=vertical_line(img, white)

# upper left is (0, 0)
xlim=c(which(abs(diff(white_vert/n_lines)) > 0.5), n_cols) # handle case when right boundary is missing
xlim=xlim[1:2]
x_max=xlim[2]-xlim[1]+1
mid_x=trunc(mean(xlim))

plot(100*white_vert/n_lines,
	xaxs="i", yaxs="i",
	xlim=c(0, n_cols), ylim=c(0, 100),
	xlab="Line (vertical)", ylab="Percent white")

# line_col=img[y_range, mid_x, 1]

change_pts=sapply(xlim[1]:xlim[2], y_col_change)

# head(sort(table(change_pts), decreasing=TRUE), n=20)

# Removes edges
change_pts[change_pts == 1]=NA
change_pts[change_pts == (ylim[2]-ylim[1]+1)]=NA

# Create a dataframe of x/y boundary coordinates
boundary_pts=adply(1:n_cols, .margins=1,
			function(X)
			adply(1:10, .margins=1,
				function(Y)
				data.frame(x=X, img_y=change_pts[Y, X])
			)
		)
boundary_pts$X1=NULL # Remove side-effect noise
boundary_pts=subset(boundary_pts, !is.na(img_y))
# Rotate zero from top left to bottom left
boundary_pts$y=ylim[2]-boundary_pts$img_y-ylim[1]+1

# plot(boundary_pts$x, boundary_pts$img_y,
# 	xaxs="i",
# 	xlim=c(1, x_max),
# 	xlab="Date", ylab="Lines")

plot(boundary_pts$x, boundary_pts$y,
	xaxs="i",
	xlim=c(1, x_max),
	xlab="Date", ylab="Lines")

# Points associated with the same boundary line should all be associated with the same color

boundary_pts$col=img[cbind(boundary_pts$img_y+ylim[1]-3, boundary_pts$x+xlim[1], 1)]+
			10*img[cbind(boundary_pts$img_y+ylim[1]-3, boundary_pts$x+xlim[1], 2)]+
			100*img[cbind(boundary_pts$img_y+ylim[1]-3, boundary_pts$x+xlim[1], 3)]

col_cnt=count(boundary_pts$col)
col_cnt=col_cnt[order(col_cnt$freq, decreasing=TRUE), ]

# head(col_cnt, n=10)

pal_col=rainbow(9)

dum=sapply(1:9, function(X)
		{
		h1=subset(boundary_pts, col == col_cnt$x[X])
		points(h1$x, h1$y, col=pal_col[X])
		}
		)