Skip to content

RAPIDS

cudf

GPU DataFrame.

cuml

Machine Learning Algorithms.

k-NN
  • find duplicates from image embedding

    import cuml
    
    model = cuml.neighbors.NearestNeighbors(n_neighbors=3)
    model.fit(embed_train)
    distances, indices = model.kneighbors(embed_test)
    
    mm = np.min(distances, axis=1)
    
    idx = np.where((mm < 2))[0]
    
k-Means
import cuml

model = cuml.KMeans(n_clusters=20)
model.fit(embed)
train['cluster'] = model.labels_
train.head()
T-SNE
import cuml

# model
model = cuml.TSNE()
embed2D = model.fit_transform(embed)
train['x'] = embed2D[:,0]
train['y'] = embed2D[:,1]

# get region with largest MM rate
X_DIV = 10; Y_DIV = 10
x_min = train.x.min()
x_max = train.x.max()
y_min = train.y.min()
y_max = train.y.max()
x_step = (x_max - x_min)/X_DIV
y_step = (y_max - y_min)/Y_DIV
mx = 0; xa_mx = 0; xb_mx=0; ya_mx = 0; yb_mx = 0
for k in range(X_DIV+1):
    for j in range(Y_DIV+1):
        xa = k*x_step + x_min
        xb = (k+1)*x_step + x_min
        ya = j*y_step + y_min
        yb = (j+1)*y_step + y_min
        df = train.loc[(train.x>xa)&(train.x<xb)&(train.y>ya)&(train.y<yb)]
        t = df.target.mean()
        if (t>mx)&(len(df)>=16):
            mx = t
            xa_mx = xa
            xb_mx = xb
            ya_mx = ya
            yb_mx = yb
        #print(k,j,t)

# vis
plt.figure(figsize=(10,10))
df1 = train.loc[train.target==0]
plt.scatter(df1.x,df1.y,color='orange',s=10,label='Benign')
df2 = train.loc[train.target==1]
plt.scatter(df2.x,df2.y,color='blue',s=10,label='Malignant')
plt.plot([xa_mx,xa_mx],[ya_mx,yb_mx],color='black')
plt.plot([xa_mx,xb_mx],[ya_mx,ya_mx],color='black')
plt.plot([xb_mx,xb_mx],[ya_mx,yb_mx],color='black')
plt.plot([xa_mx,xb_mx],[yb_mx,yb_mx],color='black')
plt.legend()
plt.show()