# # extract-waterfall.R, 17 Jul 22 # # Data from: # https://techtalk.pcmatic.com/research-charts-memory/ # library("colorspace") library("png") # Count points having color col on every horizontal line horizontal_line=function(a_img, col) { lines_col=sapply(1:n_lines, function(X) sum((a_img[X, , 1]==col[1]) & (a_img[X, , 2]==col[2]) & (a_img[X, , 3]==col[3])) ) return(lines_col) } # Count points having color col on every vertical line vertical_line=function(a_img, col) { lines_col=sapply(1:n_cols, function(X) sum((a_img[ , X, 1]==col[1]) & (a_img[ , X, 2]==col[2]) & (a_img[ , X, 3]==col[3])) ) return(lines_col) } # Return y position of vertical color changes at x_pos y_col_change=function(x_pos) { # A good enough technique for generating a unique value per RGB color col_change=which(!duplicated(img[y_range, x_pos, 1]+ 10*img[y_range, x_pos, 2]+ 100*img[y_range, x_pos, 3])) # There is a 1-pixel separation line between colors, which creates a run of 2 or 3 consecutive numbers. # Diff is used to find these consecutive sequences. y_change=c(1, col_change[which(diff(col_change) > 1)+1]) # Always return a vector containing max_vals elements (or d)... return(c(y_change, rep(NA, max_vals-length(y_change)))) } # Image read in row/col order, i.e., y/x # With (0, 0) in top left img=readPNG("../rc_mem_memrange_all.php.png") n_lines=dim(img)[1] n_cols=dim(img)[2] max_vals=20 # Needs to be larger than classified values because of separator line color white=c(1, 1, 1) white_horiz=horizontal_line(img, white) ylim=c(0, which(abs(diff(white_horiz/n_cols)) > 0.5)) # handle when upper boundary is missing ylim=ylim[2:3] y_range=ylim[1]:ylim[2] plot(100*white_horiz/n_cols, xaxs="i", yaxs="i", xlim=c(0, n_lines), ylim=c(0, 100), xlab="Line (horizontal)", ylab="Percent white") white_vert=vertical_line(img, white) # upper left is (0, 0) xlim=c(which(abs(diff(white_vert/n_lines)) > 0.5), n_cols) # handle case when right boundary is missing xlim=xlim[1:2] x_max=xlim[2]-xlim[1]+1 mid_x=trunc(mean(xlim)) plot(100*white_vert/n_lines, xaxs="i", yaxs="i", xlim=c(0, n_cols), ylim=c(0, 100), xlab="Line (vertical)", ylab="Percent white") # line_col=img[y_range, mid_x, 1] change_pts=sapply(xlim[1]:xlim[2], y_col_change) # head(sort(table(change_pts), decreasing=TRUE), n=20) # Removes edges change_pts[change_pts == 1]=NA change_pts[change_pts == (ylim[2]-ylim[1]+1)]=NA # Create a dataframe of x/y boundary coordinates boundary_pts=adply(1:n_cols, .margins=1, function(X) adply(1:10, .margins=1, function(Y) data.frame(x=X, img_y=change_pts[Y, X]) ) ) boundary_pts$X1=NULL # Remove side-effect noise boundary_pts=subset(boundary_pts, !is.na(img_y)) # Rotate zero from top left to bottom left boundary_pts$y=ylim[2]-boundary_pts$img_y-ylim[1]+1 # plot(boundary_pts$x, boundary_pts$img_y, # xaxs="i", # xlim=c(1, x_max), # xlab="Date", ylab="Lines") plot(boundary_pts$x, boundary_pts$y, xaxs="i", xlim=c(1, x_max), xlab="Date", ylab="Lines") # Points associated with the same boundary line should all be associated with the same color boundary_pts$col=img[cbind(boundary_pts$img_y+ylim[1]-3, boundary_pts$x+xlim[1], 1)]+ 10*img[cbind(boundary_pts$img_y+ylim[1]-3, boundary_pts$x+xlim[1], 2)]+ 100*img[cbind(boundary_pts$img_y+ylim[1]-3, boundary_pts$x+xlim[1], 3)] col_cnt=count(boundary_pts$col) col_cnt=col_cnt[order(col_cnt$freq, decreasing=TRUE), ] # head(col_cnt, n=10) pal_col=rainbow(9) dum=sapply(1:9, function(X) { h1=subset(boundary_pts, col == col_cnt$x[X]) points(h1$x, h1$y, col=pal_col[X]) } )