728x90

2023-09-20 18th Class

❤️ 배운 것

머신러닝 관련 수학 공식 파이썬 알고리즘으로 구현하기 - 2
(1차원 list 편)

  • mean subtraction
def e147():
    """Demonstrate mean subtraction on two hard-coded score lists.

    Prints the per-subject sums and means, the scores after each subject's
    mean has been subtracted, and finally the sum and mean of the centred
    math scores.
    """
    math_scores = [40, 60, 80]
    english_scores = [30, 40, 50]
    n_student = len(math_scores)

    # Accumulate both subject totals in a single pass over the paired scores.
    score_sums = [0, 0]
    for math_score, english_score in zip(math_scores, english_scores):
        score_sums[0] += math_score
        score_sums[1] += english_score
    print("sums of scores: ", score_sums)

    score_means = [total / n_student for total in score_sums]
    print("means of scores: ", score_means)

    # Centre every score on its own subject's mean.
    math_mean, english_mean = score_means
    math_scores = [score - math_mean for score in math_scores]
    english_scores = [score - english_mean for score in english_scores]
    print("Math scores after mean subtraction: ", math_scores)
    print("English scores after mean subtraction: ", english_scores)

    # Re-sum the centred math scores and report their sum and mean.
    tmp = 0
    for centred_score in math_scores:
        tmp = tmp + centred_score

    print(f"{tmp = }, {n_student = }")
    print(f"{tmp / n_student = }")
sums of scores:  [180, 120]
means of scores:  [60.0, 40.0]
Math scores after mean subtraction:  [-20.0, 0.0, 20.0]
English scores after mean subtraction:  [-10.0, 0.0, 10.0]
tmp = 0.0, n_student = 3
tmp / n_student = 0.0
  • 분산, 표준편차
def _mean_and_std(values):
    """Return the mean and population standard deviation of *values*."""
    total, square_total = 0, 0
    for value in values:
        total += value
        square_total += value ** 2

    n_value = len(values)
    mean = total / n_value
    # Variance as "mean of squares minus square of mean".
    variance = square_total / n_value - mean ** 2
    return mean, variance ** 0.5


def e150():
    """Standardize a hard-coded score list and check the result.

    Each score is rescaled to ``(score - mean) / std`` and the rescaled list
    is printed. The mean and standard deviation of the rescaled list are then
    recomputed and printed.
    """
    scores = [10, 20, 30]

    mean, std = _mean_and_std(scores)
    scores = [(score - mean) / std for score in scores]
    print(scores)

    # Recompute the statistics over the whole list instead of hard-coding
    # three indices, so the check keeps working if the list length changes.
    mean, std = _mean_and_std(scores)
    print('mean: ', mean)
    print('standard deviation: ', std)


def e151():
    """Print the mean and population standard deviation of two score lists."""
    labelled_scores = (
        ("mean/std of Math: ", [50, 60, 70]),
        ("mean/std of English: ", [30, 40, 50]),
    )
    # Both lists have the same length; the first one sets the student count.
    n_student = len(labelled_scores[0][1])

    results = []
    for label, scores in labelled_scores:
        score_sum = sum(scores)
        square_sum = sum(score ** 2 for score in scores)
        mean = score_sum / n_student
        # mean of square - square of mean
        variance = square_sum / n_student - mean ** 2
        results.append((label, mean, variance ** 0.5))

    for label, mean, std in results:
        print(label, mean, std)
==============================
variancee:  66.66666666666669
standard deviation:  8.16496580927726
==============================
[-1.224744871391589, 0.0, 1.224744871391589]
mean:  0.0
standard deviation:  1.0
==============================
mean/std of Math:  60.0 8.164965809277252
mean/std of English:  40.0 8.164965809277264
==============================
  • standardize
def get_sum_square_mean(val):
    """Return ``(sum, sum of squares, mean)`` for the values in *val*.

    The mean divides by ``len(val)``, so an empty *val* raises
    ``ZeroDivisionError``.
    """
    total = 0
    square_total = 0
    for item in val:
        total, square_total = total + item, square_total + item ** 2

    mean = total / len(val)
    return total, square_total, mean


def standardize(val):
    """Standardize *val* and report the statistics of the result.

    Every element is rescaled to ``(x - mean) / std`` using the mean and
    population standard deviation of *val*.

    Returns:
        A tuple ``(new, mean, std)``: the rescaled list followed by the mean
        and standard deviation recomputed from that rescaled list.
    """
    n_value = len(val)

    def population_std(square_total, mean):
        # std = sqrt(mean of squares - square of mean)
        return (square_total / n_value - mean ** 2) ** 0.5

    _, square_total, mean = get_sum_square_mean(val)
    scale = population_std(square_total, mean)
    new = [(item - mean) / scale for item in val]

    # Measure the rescaled values so the caller can check the result.
    _, new_square_total, new_mean = get_sum_square_mean(new)
    return new, new_mean, population_std(new_square_total, new_mean)


def e152():
    """Mean-subtract, then standardize, two score lists and print the results."""
    math_scores, english_scores = [50, 60, 70], [30, 40, 50]

    # Only the mean of each subject is needed for the centring step.
    *_, math_mean = get_sum_square_mean(math_scores)
    *_, english_mean = get_sum_square_mean(english_scores)

    math_scores = [score - math_mean for score in math_scores]
    english_scores = [score - english_mean for score in english_scores]

    # standardization
    math_scores, math_mean, math_std = standardize(math_scores)
    english_scores, english_mean, english_std = standardize(english_scores)

    print("Math scores after standardization: ", math_scores)
    print("English scores after standardization: ", english_scores)

    print("mean/std of Math: ", math_mean, math_std)
    print("mean/std of English: ", english_mean, english_std)
Math scores after standardization:  [-1.224744871391589, 0.0, 1.224744871391589]
English scores after standardization:  [-1.224744871391589, 0.0, 1.224744871391589]
mean/std of Math:  0.0 1.0
mean/std of English:  0.0 1.0
  • hadamard product
def e153():
    """Print the Hadamard (element-wise) product of two vectors, built two ways."""
    v1 = [1, 2, 3, 4, 5]
    v2 = [10, 20, 30, 40, 50]

    # method 1: build the result directly from the paired elements
    v3 = [left * right for left, right in zip(v1, v2)]
    print(v3)

    # method 2: pre-allocate a zero vector, then fill it position by position
    v3 = [0] * len(v1)
    for dim_idx, (left, right) in enumerate(zip(v1, v2)):
        v3[dim_idx] = left * right
    print(v3)
[10, 40, 90, 160, 250]
[10, 40, 90, 160, 250]
  • norm & unit(1) 길이로 변경
def e154(v1=None):
    """Print and return the Euclidean norm of *v1*.

    Args:
        v1: Sequence of numbers; ``[1, 2, 3]`` is used when it is ``None``.

    Returns:
        The square root of the sum of squared components.
    """
    vector = [1, 2, 3] if v1 is None else v1

    square_sum = 0
    for component in vector:
        square_sum = square_sum + component ** 2

    norm = square_sum ** 0.5
    print("in method norm of v1: ", norm)
    return norm


def e155():
    """Rescale a vector to unit length and print its norm before and after."""
    v1 = [1, 2, 3]
    norm = e154(v1)
    print("initial norm: ", norm)

    # Dividing every component by the norm gives a vector of length 1.
    v1 = [component / norm for component in v1]

    norm = e154(v1)
    print("new norm of v1: ", norm)
==============================
in method norm of v1:  3.7416573867739413
==============================
in method norm of v1:  3.7416573867739413
initial norm:  3.7416573867739413
in method norm of v1:  1.0
new norm of v1:  1.0
==============================
  • euclidean distance
def e157():
    """Print the Euclidean distance between two hard-coded vectors."""
    v1, v2 = [1, 2, 3], [3, 4, 5]

    # Sum the squared component-wise differences, then take the square root.
    diff_square_sum = sum((left - right) ** 2 for left, right in zip(v1, v2))
    e_distance = diff_square_sum ** 0.5

    print("euclidean distance between v1 and v2: ", e_distance)
euclidean distance between v1 and v2:  3.4641016151377544
  • MSE
def e158():
    """Print the mean squared error between hard-coded predictions and labels."""
    predictions = [10, 20, 30]
    labels = [10, 25, 40]

    squared_errors = [
        (prediction - label) ** 2
        for prediction, label in zip(predictions, labels)
    ]
    mse = sum(squared_errors) / len(predictions)

    print("MSE: ", mse)
MSE:  41.666666666666664

if, for, list 연습

def e159():
    """Count how often each digit 0-4 occurs in a hard-coded list and print it."""
    numbers = [0, 2, 4, 2, 1, 4, 3, 1, 2, 3, 4, 1, 2, 3, 4]

    # Position k of the result holds the number of occurrences of k.
    number_cnt = [numbers.count(digit) for digit in range(5)]

    print(number_cnt)


def e160():
    """Print "pass" when the hard-coded score exceeds 50 (plain ``if``)."""
    score = 60
    passed = score > 50
    if passed:
        print("pass")


def e161():
    """Print "pass" or "try again" depending on whether the score beats the cutoff."""
    score = 40
    cutoff = 50

    message = "pass" if score > cutoff else "try again"
    print(message)


def e162():
    """Convert a hard-coded number of seconds into minutes and seconds and print it."""
    total_seconds = 200

    # divmod gives the whole minutes and the leftover seconds in one step;
    # below 60 seconds the minute count is simply 0.
    minutes, seconds = divmod(total_seconds, 60)

    print(minutes, "min", seconds, "sec")


def e163(seconds=None):
    """Print *seconds* broken down into hours, minutes and seconds.

    Only the units that are needed are shown: durations under a minute are
    printed as seconds, durations under an hour as minutes and seconds, and
    anything longer as hours, minutes and seconds.

    Args:
        seconds: Duration in seconds. Defaults to 5000 when omitted or
            ``None``. The parameter originally had no default, so calling
            ``e163()`` raised ``TypeError`` even though the body handles
            ``None``.
    """
    if seconds is None:
        seconds = 5000

    ori_sec = seconds
    seconds_per_minute = 60
    seconds_per_hour = 60 * seconds_per_minute

    if ori_sec >= seconds_per_hour:
        hours, remainder = divmod(ori_sec, seconds_per_hour)
        minutes, seconds = divmod(remainder, seconds_per_minute)
        print(f"{ori_sec} seconds is ", hours, "hours", minutes, "min", seconds, "sec")

    elif ori_sec >= seconds_per_minute:
        minutes, seconds = divmod(ori_sec, seconds_per_minute)
        print(f"{ori_sec} seconds is ", minutes, "min", seconds, "sec")

    else:
        print(f"{ori_sec} seconds is ", seconds, "seconds")


def e164():
    """Print whether a hard-coded number is even or odd."""
    number = 10
    parity = 'even' if number % 2 == 0 else "odd"
    print(parity)


def e165():
    """Compare two hard-coded numbers and print which one is larger, or "equal"."""
    num1, num2 = 10, 10

    if num1 == num2:
        verdict = "equal"
    elif num1 > num2:
        verdict = "first number"
    else:
        verdict = "second number"

    print(verdict)


def e169():
    """Print the mean score of the passing group and of the failing group.

    A score passes only when it is strictly greater than the cutoff.
    """
    scores = [20, 50, 10, 60, 90]
    cutoff = 50

    # Split the scores into the two groups first, then average each group.
    passed = [score for score in scores if score > cutoff]
    not_passed = [score for score in scores if score <= cutoff]

    p_score_mean = sum(passed) / len(passed)
    np_score_mean = sum(not_passed) / len(not_passed)

    print("mean of passed scores: ", p_score_mean)
    print("mean of no passed scores: ", np_score_mean)


def e170():
    """Classify each list element as even, odd, or not an integer and print it."""
    numbers = [*range(10), 3.14]
    print(numbers)

    # The remainder modulo 2 decides the label: 0 is even, 1 is odd, and any
    # other remainder (e.g. from 3.14) means the value is not an integer.
    labels_by_remainder = {0: 'even number', 1: "odd number"}
    for num in numbers:
        print(labels_by_remainder.get(num % 2, 'not an integer'))


def e171():
    """Print the sum of all multiples of 3 among the integers 0 to 99."""
    multiple_of = 3

    sum_multiple_of_n = sum(
        num for num in range(100) if num % multiple_of == 0
    )

    print(sum_multiple_of_n)
[1, 3, 4, 3, 4]
==============================
pass
==============================
try again
==============================
3 min 20 sec
==============================
5000 seconds is  1 hours 23 min 20 sec
==============================
200 seconds is  3 min 20 sec
==============================
45 seconds is  45 seconds
==============================
even
==============================
equal
==============================
mean of passed scores:  75.0
mean of no passed scores:  26.666666666666668
==============================
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 3.14]
even number
odd number
even number
odd number
even number
odd number
even number
odd number
even number
odd number
not an integer
==============================
1683

if문, for문 활용하여 최댓값, 최솟값 구하기

  • min max normalization
def e172():
    """Print the maximum and minimum of a hard-coded score list.

    The search starts from the fixed values 0 (for the maximum) and 100 (for
    the minimum), so those two values take part in the comparison alongside
    the scores.
    """
    scores = [60, 40, 70, 20, 30]

    # Include the starting values in the candidates, mirroring a running
    # max/min that is initialised to 0 and 100.
    M = max([0] + scores)
    m = min([100] + scores)

    print("Max value: ", M)
    print("Min value: ", m)


def e173():
    """Find the maximum and minimum of a score list in two ways and print both.

    Neither method assumes anything about the value range of the scores.
    """
    scores = [-20, 60, 40, 70, 120]

    # method 1: seed both extremes with the first element
    M, m = scores[0], scores[0]
    for score in scores:
        if score > M:
            M = score
        if score < m:
            m = score

    print("Max value: ", M)
    print("min value: ", m)

    # method 2: start from None, meaning "nothing seen yet"
    M, m = None, None

    for score in scores:
        # Identity check: None is a singleton, so compare with `is`.
        if M is None or score > M:
            M = score
        if m is None or score < m:
            m = score

    print("Max value: ", M)
    print("min value: ", m)
Max value:  70
Min value:  20
==============================
Max value:  120
min value:  -20
Max value:  120
min value:  -20
  • MinMaxNormalization
def e174():
    """Apply min-max normalization to a score list and print each step.

    Prints the extremes of the raw scores, the scores rescaled with
    ``(score - min) / (max - min)``, and the extremes of the rescaled scores.
    """

    def report_extremes(values):
        # Print the largest and smallest value, then hand both back.
        largest, smallest = max(values), min(values)
        print("Max value: ", largest)
        print("Min value: ", smallest)
        return largest, smallest

    scores = [-20, 60, 40, 70, 120]

    # Extremes of the raw scores.
    M, m = report_extremes(scores)

    span = M - m
    scores = [(score - m) / span for score in scores]
    print("scores after normalization: \n", scores)

    # Extremes of the normalized scores.
    report_extremes(scores)
Max value:  120
Min value:  -20
scores after normalization: 
 [0.0, 0.5714285714285714, 0.42857142857142855, 0.6428571428571429, 1.0]
Max value:  1.0
Min value:  0.0
def e175():
    """Print the maximum and minimum of a score list together with their indices.

    When a value occurs more than once, the index of its first occurrence is
    reported, because only strictly larger or smaller scores replace the
    current extreme.
    """
    scores = [60, -20, 40, 120, 70]
    M, m = None, None
    M_idx, m_idx = 0, 0

    for score_idx, score in enumerate(scores):
        # None means no score has been seen yet; compare with `is`.
        if M is None or score > M:
            M, M_idx = score, score_idx

        if m is None or score < m:
            m, m_idx = score, score_idx

    print("M/M_idx: ", M, M_idx)
    print("m/m_idx: ", m, m_idx)
M/M_idx:  120 3
m/m_idx:  -20 1
  • list 원소 sort (DESC)
def e176():
    """Sort a hard-coded score list in descending order, printing every step.

    Each round moves the largest remaining score (the first one, if it occurs
    more than once) to the sorted list, then prints what remains and what has
    been sorted so far.
    """
    scores = [40, 20, 30, 10, 50]
    sorted_scores = []

    # One score leaves the list per round, so this runs once per element.
    while scores:
        M_idx = scores.index(max(scores))
        sorted_scores.append(scores[M_idx])
        scores = scores[:M_idx] + scores[M_idx + 1:]

        print("remaining scores: ", scores)
        print("sorted scores: ", sorted_scores, '\n')
remaining scores:  [40, 20, 30, 10]
sorted scores:  [50] 

remaining scores:  [20, 30, 10]
sorted scores:  [50, 40] 

remaining scores:  [20, 10]
sorted scores:  [50, 40, 30] 

remaining scores:  [10]
sorted scores:  [50, 40, 30, 20] 

remaining scores:  []
sorted scores:  [50, 40, 30, 20, 10] 

자습

  • scikit-learn 데이터 pandas 처리, 결측치 처리
  • 파이프라인
  • 그리드서치

💛 배운점/느낀점

  • scaling 종류가 min max, standard, robust 등 다양하게 있는데 목적에 따라 사용한다는 것을 알게 되었다.
  • trainset에서 x값은 scaling을 하고, y값은 그대로 사용하는데,
    독립변수(x)는 여러 개가 존재해서 scaling을 하고, 종속변수(y)는 그대로 사용한다는 것을 알게 되었다.
    feature가 많을수록 모형이 복잡해지고 차원의 저주에 빠진다는 점이 통계와 관련 있었다...
반응형