不同版本打字机效率统计分析¶

Load Packages and Define Functions¶

import itertools
import warnings
from collections import defaultdict
import numpy as np
import pandas as pd
import scipy.stats
import statsmodels.stats.api as sms
import math 
from scipy import stats
from scipy.stats import ttest_ind
warnings.filterwarnings("ignore")

def split_freq(text):
    '''
    把文学作品中的所有文字（符号）拆分成两个连续字母（符号）为单元的 dataframe 数据集
    '''
    # split data
    data = list(text)
    A = []
    for i, j in zip(data, data[1:]):
        c = i + j
        # print(c)
        A.append(c)
    # A[:5]
    B = []
    for i, j in zip(data[1:], data[2:]):
        c = i + j
        # print(c)
        B.append(c)
    # B[:5]
    all = A + B
    # print(len(A),"+",len(B),"=",len(all))

    # get the freq count for all
    d = defaultdict(int)

    for word in all:
        d[word] += 1

    df = pd.DataFrame(d.items())
    df.set_index(0, inplace=True)

    return (df, len(all))

def get_result(target, df):
    '''
    获得不同打字机（键盘）相邻字母在文字数据中出现的频次统计
    '''
    seq = df.loc[list(target["ABC"]),]
    seq.dropna(axis=0, inplace=True)
    seq = seq.reset_index()
    seq.columns = ["word_ABC", "freq_ABC"]
    data = []
    data.insert(0, {"word_ABC": "sum", "freq_ABC": sum(seq.freq_ABC)})
    seq_all = pd.concat([pd.DataFrame(data), seq], ignore_index=True)
    seq_all = seq_all.sort_values("freq_ABC", ascending=False).reset_index()

    qwerty = df.loc[list(target["QWERTY"]),]
    qwerty.dropna(axis=0, inplace=True)
    qwerty = qwerty.reset_index()
    qwerty.columns = ["word_QWERTY", "freq_QWERTY"]
    data = []
    data.insert(0, {"word_QWERTY": "sum", "freq_QWERTY": sum(qwerty.freq_QWERTY)})
    qwerty_all = pd.concat([pd.DataFrame(data), qwerty], ignore_index=True)
    qwerty_all = qwerty_all.sort_values("freq_QWERTY", ascending=False).reset_index()

    qwerty_m = df.loc[list(target["QWERTY_m"]),]
    qwerty_m.dropna(axis=0, inplace=True)
    qwerty_m = qwerty_m.reset_index()
    qwerty_m.columns = ["word_QWERTY_m", "freq_QWERTY_m"]
    data = []
    data.insert(
        0, {"word_QWERTY_m": "sum", "freq_QWERTY_m": sum(qwerty_m.freq_QWERTY_m)}
    )
    qwerty_m_all = pd.concat([pd.DataFrame(data), qwerty_m], ignore_index=True)
    qwerty_m_all = qwerty_m_all.sort_values("freq_QWERTY_m", ascending=False).reset_index()

    result = pd.concat([seq_all, qwerty_all, qwerty_m_all], axis=1, ignore_index=False)
    result = result[
        [
            "word_ABC",
            "freq_ABC",
            "word_QWERTY",
            "freq_QWERTY",
            "word_QWERTY_m",
            "freq_QWERTY_m",
        ]
    ]
    return result

def get_result_lr(QWERTY_right_left,QWERTY_left_right,ABC_right_left,ABC_left_right,QWERTY_m_right_left,QWERTY_m_left_right, df):
    '''
    获得不同打字机（键盘）者左右交叉字母在文字数据中出现的频次统计
    '''
    seq = df.loc[list(ABC_right_left) + list(ABC_left_right),]
    seq.dropna(axis=0, inplace=True)
    seq = seq.reset_index()
    seq.columns = ["ABC_right_left", "freq_ABC_right_left"]
    data = []
    data.insert(0, {"ABC_right_left": "sum", "freq_ABC_right_left": sum(seq.freq_ABC_right_left)})
    seq_all = pd.concat([pd.DataFrame(data), seq], ignore_index=True)
    seq_all = seq_all.sort_values("freq_ABC_right_left", ascending=False).reset_index()
   

    qwerty = df.loc[list(QWERTY_right_left) + list(QWERTY_left_right),]
    qwerty.dropna(axis=0, inplace=True)
    qwerty = qwerty.reset_index()
    qwerty.columns = ["QWERTY_right_left", "freq_QWERTY_right_left"]
    data = []
    data.insert(0, {"QWERTY_right_left": "sum", "freq_QWERTY_right_left": sum(qwerty.freq_QWERTY_right_left)})
    qwerty_all = pd.concat([pd.DataFrame(data), qwerty], ignore_index=True)
    qwerty_all = qwerty_all.sort_values("freq_QWERTY_right_left", ascending=False).reset_index()
    
    qwerty_m = df.loc[list(QWERTY_m_right_left) + list(QWERTY_m_left_right),]
    qwerty_m.dropna(axis=0, inplace=True)
    qwerty_m = qwerty_m.reset_index()
    qwerty_m.columns = ["QWERTY_m_right_left", "freq_QWERTY_m_right_left"]
    data = []
    data.insert(0, {"QWERTY_m_right_left": "sum", "freq_QWERTY_m_right_left": sum(qwerty_m.freq_QWERTY_m_right_left)})
    qwerty_m_all = pd.concat([pd.DataFrame(data), qwerty_m], ignore_index=True)
    qwerty_m_all = qwerty_m_all.sort_values("freq_QWERTY_m_right_left", ascending=False).reset_index()
    
    result = pd.concat([seq_all, qwerty_all,qwerty_m_all], axis=1, ignore_index=False)
    result = result[
        [
            "ABC_right_left",
            "freq_ABC_right_left",
            "QWERTY_right_left",
            "freq_QWERTY_right_left",
            "QWERTY_m_right_left",
            "freq_QWERTY_m_right_left",
        ]
    ]
    return result

col = [
"book No.",
"num_combinations",
"ABC%",
"QWERTY%",
"QWERTY_m%",
"QWERTY vs ABC %",
"QWERTY_m vs QWERTY %",
"QWERTY_m vs ABC %",
]

def eva(result,i,ni):
    '''
    获得每个文字数据集的各种打字机（键盘）参数百分比
    '''
    abc_per = round(result.freq_ABC[0] / n * 100, 3)
    QWERTY_per = round(result.freq_QWERTY[0] / n * 100, 3)
    QWERTY_m_per = round(result.freq_QWERTY_m[0] / n * 100, 3)

    delta = round((QWERTY_per - abc_per) / abc_per * 100, 3)
    delta2 = round((QWERTY_m_per - QWERTY_per) / QWERTY_m_per * 100, 3)
    delta3 = round((QWERTY_m_per - abc_per) / abc_per * 100, 3)

    compare_n = pd.DataFrame(
        [[i,ni, abc_per, QWERTY_per, QWERTY_m_per, delta, delta2, delta3]], columns=col
    )
    return compare_n

col_lr = [
    "book No.",
    "num_combinations",
    "ABC_right_left%",
    "QWERTY_right_left%",
    "QWERTY_m_right_left%",
    "QWERTY vs ABC right left %",
    "QWERTY_m vs QWERTY %"
]

def eva_lr(result,i,ni):
    '''
    获得每个文字数据集的各种打字机（键盘）参数百分比
    '''
    ABC_right_left_per = round(result.freq_ABC_right_left[0] / n * 100, 3)
    QWERTY_right_left_per = round(result.freq_QWERTY_right_left[0] / n * 100, 3)
    QWERTY_m_right_left_per = round(result.freq_QWERTY_m_right_left[0] / n * 100, 3)

    delta = round((QWERTY_right_left_per - ABC_right_left_per) / ABC_right_left_per * 100, 3)
    delta2 = round((QWERTY_m_right_left_per - QWERTY_right_left_per) / QWERTY_right_left_per * 100, 3)
    
    compare_n = pd.DataFrame(
        [[i,ni, ABC_right_left_per, QWERTY_right_left_per, QWERTY_m_right_left_per, delta,delta2]], columns=col_lr
    )
    return compare_n

def weighted_avg_and_std(data, weights):
    """
    计算加权平均和加权标准差
    """
    average = np.average(data, weights=weights)
    # Fast and numerically precise:
    variance = np.average((data-average)**2, weights=weights)
    return (average, math.sqrt(variance))


def weighted_mean_confidence_interval(data, weights, confidence=0.95):
    """
    计算加权置信区间
    """
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = weighted_avg_and_std(data, weights)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
    print("Weighted Mean: %.3f \nWeighted %0.4f Confidence Interval: [%.3f,%.3f]"% (m, confidence, m - h, m + h))

def product(a, b):
    '''
    输入所有左（右）手打的字母array a 
    和所有右（左）手打的字母array b，
    输出所有可能的左右连续打字的组合，
    顺序从a到b
    '''
    c = list(itertools.product(a, b))
    QWERTY_right_left = pd.DataFrame(c).dropna()
    QWERTY_right_left = QWERTY_right_left[0] + QWERTY_right_left[1]
    return QWERTY_right_left

1. 打字机中，相邻字母（符号）组合有哪些？左右手打字的字母分别有哪些？¶

其中 “ABC” 代表 ABCD 打字机的相邻字母的组合，“QWERTY” 代表第一版 QWE.TY 打字机相邻字母组合，“QWERTY_m” 代表现在键盘相邻字母组合。
“ABC_left” 代表 ABCD 打字机的所有左手打的字母，“ABC_right” 代表 ABCD 打字机的所有右手打的字母。第一版 QWE.TY 打字机亦然。
把原文的大小写字母的文章，改成全小写，以进行更全面的统计。
由于不同打字机字母组合数量不同，请忽略表格末尾出现的 NaN。

# get the target combinations
target = pd.read_csv("target.csv")
target

2. 相邻字母（符号）出现的频率对比：¶

ABCD 打字机，第一版 QWE.TY 打字机，以及现代 QWERTY 键盘¶

获得整本圣经（美国标准版）数据¶

数据来源：American Standard Version (ASV) https://bible4u.net/en/download#ASV![image.png](attachment:image.png)
以下省略（用”#“注释掉）了单独圣经的分析结果。

#load bible
text1 = open("books/Bible_ASV.txt", "r").read().lower()
df, n = split_freq(text1)
result1 = get_result(target, df)
result1.to_csv("result/Bible_ASV.csv")
#result1

eva(result1,"bible",n)

获得 30 本打字机发明的年代的美国畅销书数据¶

数据来源： https://en.wikipedia.org/wiki/1870_in_literature ，https://en.wikipedia.org/wiki/1871_in_literature ， https://en.wikipedia.org/wiki/1872_in_literature ...
以下省略（用”#“注释掉）了单独对畅销书的分析结果。

t1 = open("books/bad_boy.txt", "r").read().lower()
t2 = open("books/Blue_Jackets.txt", "r").read().lower()
t3 = open("books/From_the_Earth_to_the_Moon.txt", "r").read().lower()
t4 = open("books/Joseph_and_His_Friend.txt", "r").read().lower()
t5 = open("books/Lothair.txt", "r").read().lower()
t6 = open("books/Man_and_Wife.txt", "r").read().lower()
t7 = open("books/Memoir_of_Jane_Austen.txt", "r").read().lower()
t8 = open("books/The_Adventures_of_Harry_Richmond.txt", "r").read().lower()
t9 = open("books/The_Caged_Lion.txt", "r").read().lower()
t10 = open("books/The_Earthly_Paradis.txt", "r").read().lower()
t11 = open("books/The_Mystery_of_Edwin_Drood.txt", "r").read().lower()
t12 = open("books/The_Visionary.txt", "r").read().lower()
t13 = open("books/The_Vicar_of_Bullhampton_by_Anthony_Trollope.txt", "r").read().lower()
t14 = open("books/The_Wild_Garden.txt", "r").read().lower()
t15 = open("books/Twenty_Thousand_Leagues_under_the_Sea_by_Jules_Verne.txt", "r").read().lower()
t16 = open("books/Venus_in_Furs_by_Ritter_von_Leopold_Sacher-Masoch.txt", "r").read().lower()
t17 = open("books/The_Adventures_of_Tom_Sawyer.txt", "r").read().lower()
t18 = open("books/Atthe_Back_of_the_North_Wind.txt", "r").read().lower()
t19 = open("books/Coles_Funny_Picture_Book.txt", "r").read().lower()
t20 = open("books/The_Cuckoo_Clock.txt", "r").read().lower()
t21 = open("books/The_Lost_Princess.txt", "r").read().lower()
t22 = open("books/Mildred_Keith.txt", "r").read().lower()
t23 = open("books/The_Princess_and_the_Goblin.txt", "r").read().lower()
t24 = open("books/What_Katy_Did.txt", "r").read().lower()
t25 = open("books/Under_the_Window.txt", "r").read().lower()
t26 = open("books/Carmilla.txt", "r").read().lower()
t27 = open("books/Erewhon.txt", "r").read().lower()
t28 = open("books/Daisy_Miller.txt", "r").read().lower()
t29 = open("books/Leavenworth.txt", "r").read().lower()
t30 = open("books/Rosein_Bloom.txt", "r").read().lower()

text2 = (
    t1 + t2 + t3 + t4 + t5 + t6 + t7 + t8 + t9 + t10 + t11
    + t12 + t13 + t14 + t15 + t16 + t17 + t18 + t19 + t20 
    + t21 + t22 + t23 + t24 + t25 + t26 + t27 + t28 + t29 + t30
)

df, n = split_freq(text2)
result2 = get_result(target, df)
result2.to_csv("result/books30.csv")
#result2

eva(result2,"2-30",n)

相邻字母（符号）出现频率的分析（圣经 + 30本畅销书）¶

以下是三种打字机（键盘）的相邻字母（符号）组合在圣经 + 30 本畅销书，也就是31本文学作品中出现的频次。”sum“ 表示所有之和。

其中，"word" 代表这种打字机键盘的相邻字母组合，"freq" 代表字母组合出现的频率。

text3 = text1 + text2
df, n = split_freq(text3)
result3 = get_result(target, df)
result3.to_csv("result/books_and_bible.csv")
result3

以下是对每一本文学作品做的结果统计：

"book No." 表示数的编号。1 号是圣经，2-31 号是畅销书，对应以上获得数据时的 variable t2,t3...
"num_combinations" 表示每本书所有文字数据产生的字母（符号）组合的个数。
"ABC%" 代表 ABCD 打字机相邻字母组合数量占所有字母组合数量的百分比。QWE.TY 打字机，或者 QWERTY 键盘亦然。
"ABC vs QWERTY %" 代表 (ABC% - QWERTY%)/ABC%，其他 vs 组合亦然。

eva(result3,"all",n)

compare = pd.DataFrame(columns=col)
text_iter = [text1,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,
             t16,t17,t18,t19,t20,t21,t22,t23,t24,t25,t26, t27,t28,t29,t30,
]
c=0
for i in text_iter:
    c+=1
    dfi, ni = split_freq(i)
    result_i = get_result(target, dfi)
    compare_i = eva(result_i,c,ni)
    compare = compare.append(compare_i)
compare = compare.sort_values("num_combinations", ascending=False).reset_index().drop(['index'], axis=1)
compare

第一版 QWE.TY 打字机比 ABCD 打字机¶

第一版 QWE.TY 打字机与 ABCD 键盘相比，在打字机遇到相邻字母同时出现卡壳的情况减少了 75.541%. 我们看到了所有文字作为整体而产生的数据，那么单独每本书，趋势是不是差不多呢？这个卡壳情况减少的 99.99% 置信区间又有多少呢？

data = compare["QWERTY vs ABC %"]
weights = compare["num_combinations"]
confidence=0.9999
weighted_mean_confidence_interval(data,weights,confidence)

Weighted Mean: -76.054 
Weighted 0.9999 Confidence Interval: [-88.947,-63.161]

对于我们的数据集 31 本文学作品来说，第一版 QWE.TY 打字机与 ABCD 键盘相比，打字机遇到相邻字母同时出现卡壳的情况，用每本书的字数做为权重，平均减少了 76%。
对于打字员敲打那个年代的英文文学作品，我们有 99.99% 的信心，第一版 QWE.TY 打字机比 ABCD 打字机减少相邻字母卡壳概率的加权平均值在 64% 到 89% 之间。

现代 QWERTY 键盘比第一版 QWE.TY 打字机¶

data = compare["QWERTY_m vs QWERTY %"]
weights = compare["num_combinations"]
confidence=0.9999
weighted_mean_confidence_interval(data,weights,confidence)

Weighted Mean: 76.863 
Weighted 0.9999 Confidence Interval: [65.386,88.340]

对于我们的数据集31本文学作品来说，现代 QWERTY 键盘与第一版 QWE.TY 键盘相比，现代 QWERTY 键盘没有了相邻字母卡壳的风险，所以大胆增加了遇到相邻字母组合的概率达到加权平均的 77%，以提高打字效率。
对于打字员敲打那个年代的英文文学作品，我们有 99.99% 的信心，现代 QWERTY 键盘比第一版 QWE.TY 键盘增加遇到相邻字母组合的概率的加权平均值在 65% 到 88.3% 之间。
那么现代 QWERTY 键盘到底增加了哪些相邻字母的组合呢？

找出第一版 QWE.TY 打字机没有，而现代 QWERTY 键盘新增的相邻字母组合¶

lis = []
for i in result3["word_QWERTY_m"]:
    if i not in list(result3["word_QWERTY"]):
        a = result3["freq_QWERTY_m"][result3["word_QWERTY_m"] == i].values[0]
        lis.append([i, a])
diff = pd.DataFrame(lis).sort_values(1, ascending=False)
diff.columns = ['word_QWERTY_m', 'freq_QWERTY_m']
diff

3. 左右手交叉使用来提高效率的分析¶

获得不同打字机所有可能的左右连续打字的组合¶

a = target.QWERTY_right
b = target.QWERTY_left

ABC_right_left = product(target.ABC_right, target.ABC_left)
ABC_left_right = product(target.ABC_left, target.ABC_right)
QWERTY_right_left = product(target.QWERTY_right, target.QWERTY_left)
QWERTY_left_right = product(target.QWERTY_left, target.QWERTY_right)
QWERTY_m_right_left = product(target.QWERTY_m_right, target.QWERTY_m_left)
QWERTY_m_left_right = product(target.QWERTY_m_left, target.QWERTY_m_right)

result_lr = get_result_lr(QWERTY_right_left,QWERTY_left_right,ABC_right_left,ABC_left_right,QWERTY_m_right_left,QWERTY_m_left_right, df)
result_lr.to_csv("result/left_right.csv")
result_lr.head(30)

eva_lr(result_lr,"all",n)

compare = pd.DataFrame(columns=col_lr)
c=0
for i in text_iter:
    c+=1
    dfi, ni = split_freq(i)
    result_i = get_result_lr(QWERTY_right_left,QWERTY_left_right,ABC_right_left,ABC_left_right,QWERTY_m_right_left,QWERTY_m_left_right, dfi)
    compare_i = eva_lr(result_i,c,ni)
    compare = compare.append(compare_i)
compare_lr = compare.sort_values("num_combinations", ascending=False).reset_index().drop(['index'], axis=1)
compare_lr

data = compare_lr["QWERTY vs ABC right left %"]
weights = compare_lr["num_combinations"]
confidence=0.9999
weighted_mean_confidence_interval(data,weights,confidence)

Weighted Mean: 30.825 
Weighted 0.9999 Confidence Interval: [5.683,55.967]

对于我们的数据集 31 本文学作品来说，第一版 QWE.TY 打字机与 ABCD 打字机相比，打字机遇到左右手交替使用的情况，用每本书的字数做为权重，平均增加了 31%。
对于打字员敲打所有那个年代的英文文学作品，我们有 99.99% 的信心，第一版 QWE.TY 打字机比 ABCD 打字机增加了左右手交替使用的情况，增加的范围在 6% 到 56% 之间。

data = compare_lr["QWERTY_m vs QWERTY %"]
weights = compare_lr["num_combinations"]
confidence=0.9999
weighted_mean_confidence_interval(data,weights,confidence)

Weighted Mean: -2.413 
Weighted 0.9999 Confidence Interval: [-11.531,6.706]

a = compare_lr["QWERTY_right_left%"]
b = compare_lr["QWERTY_m_right_left%"]
t, p = ttest_ind(a, b, equal_var=False)
print("t-value: {} \np-value: {}".format(t,p))

t-value: 0.06988645232474403 
p-value: 0.9445164611688741

对于我们的数据集 31 本文学作品来说，现代 QWERTY 键盘与第一版 QWE.TY 打字机相比，打字机遇到左右手交替使用的情况，用每本书的字数做为权重，平均减少了 -2.4%。
但是，由于 99.99% 的置信区间在 -11.5% 到 6.7% 之间，中间有个 0 。所以我们不能判断，对于打字员敲打所有那个年代的英文文学作品，现代 QWERTY 键盘比第一版 QWE.TY 打字机增加了或者减少了左右手交替使用的情况。
更进一步，我们做了两种打字机数据的 t 检验，发现 p-value = 0.94 远远大于 0.05，所以我们不能否定 null hypothesis，即对于现代 QWERTY 键盘与第一版 QWE.TY 打字机，打字机遇到左右手交替使用的情况是一样的。

注释¶

在处理文字内容时，没有单独处理换行相关的 / 符号。并且由于不同的书尺寸不同，每行字母数量不一，实际打字机换行的操作也不确定会在哪里出现，所以统计就忽略换行符号和换行操作不计了。
这里只统计了打字机发明年代的畅销书中能够获得txt版本电子书的部分畅销书，严格来说，不能算做随机样本。有可能由于没有txt版本的书的某些共性的原因，导致他们没有电子版，并且这个共性的原因也影响着不同打字机的相邻字母组合在书中出现的概率，那么这份样本不能代表当年整体英文文学作品。但是这个可能性非常小。所以我们假设选取的圣经美国标准版和畅销书样本能代表当年整体英文文学作品。
样本容量是 31 > 30，根据中心极限定理，我们就可以认为样本的平均值符合正态分布，依次我们算出以上置信区间。
第一版 QWERT.Y 键盘是根据 Wikipedia 来的，上面省略了 1 和句号 . 的位置。所以 12 相连的字母组合，以及句号 . 和别的什么字母（符号）相连没有统计进去。
我们所使用的不同版本的打字机（键盘）的图片上显示，能打出的字符和数字是不同的。比如 ABCD 打字机和 QWE.TY 打字机都没有数字 1 可以打印。ABCD 打字机字符只有 - .，而 QWE.TY 打字机的字符有 - ; , . ? &。对于现代 QWERTY 键盘来说，因为能打的字符实在太多了，全部统计的话，就不公平，不是 apple to apple 的比较了。所以这里是统计直接与数字或者字母相邻的字符行程的组合，并且去掉 12 这个早期打字机版本的图片里面没有出现的组合。于是就保留了字符 - ; , 和数字 0。最后是的可比较的相邻字母组合，对于 ABCD 打字机有 35 组，对于 QWE.TY 打字机，有 33 组，对于 QWERTY 打字机，有 35 组。

返回科学声音主页

	word_ABC	freq_ABC	word_QWERTY	freq_QWERTY	word_QWERTY_m	freq_QWERTY_m
0	sum	1161700.0	sum	276976.0	sum	1214090.0
1	hi	265300.0	we	105130.0	er	504450.0
2	st	215028.0	e.	57600.0	as	226028.0
3	no	189304.0	op	37946.0	we	105130.0
4	de	147520.0	ty	35024.0	gh	83976.0
5	gh	83976.0	yi	11390.0	io	82904.0
6	rs	81982.0	lm	8472.0	rt	68268.0
7	ab	48146.0	23	3578.0	op	37946.0
8	op	37946.0	kl	3510.0	ty	35024.0
9	tu	37902.0	n?	3134.0	m,	23564.0
10	ef	35176.0	iu	2594.0	ui	22496.0
11	lm	8472.0	df	1898.0	xc	6456.0
12	kl	3510.0	ax	1844.0	23	3578.0
13	24	3306.0	34	1448.0	kl	3510.0
14	35	1370.0	sd	1166.0	l;	2794.0
15	46	696.0	45	752.0	df	1898.0
16	ij	510.0	bn	346.0	34	1448.0
17	8.	366.0	78	290.0	sd	1166.0
18	57	300.0	56	220.0	nm	1090.0
19	68	230.0	89	200.0	45	752.0
20	uv	184.0	67	118.0	bn	346.0
21	xy	156.0	&c	92.0	78	290.0
22	79	108.0	-,	82.0	90	250.0
23	cd	80.0	.t	64.0	56	220.0
24	.a	72.0	9-	58.0	89	200.0
25	yz	42.0	hj	12.0	67	118.0
26	bc	10.0	fg	6.0	yu	100.0
27	fg	6.0	zs	2.0	0-	68.0
28	qr	2.0	NaN	NaN	hj	12.0
29	NaN	NaN	NaN	NaN	fg	6.0
30	NaN	NaN	NaN	NaN	vb	2.0

	book No.	num_combinations	ABC%	QWERTY%	QWERTY_m%	QWERTY vs ABC %	QWERTY_m vs QWERTY %	QWERTY_m vs ABC %
0	1	10405529	0.680	0.151	0.575	-77.794	73.739	-15.441
1	7	2595957	0.181	0.045	0.196	-75.138	77.041	8.287
2	9	2420593	0.159	0.042	0.187	-73.585	77.540	17.610
3	14	2284279	0.168	0.036	0.177	-78.571	79.661	5.357
4	6	1765541	0.126	0.029	0.138	-76.984	78.986	9.524
5	20	1747407	0.104	0.026	0.109	-75.000	76.147	4.808
6	3	1629763	0.114	0.024	0.128	-78.947	81.250	12.281
7	30	1260439	0.088	0.019	0.095	-78.409	80.000	7.955
8	10	1237183	0.090	0.022	0.097	-75.556	77.320	7.778
9	5	1231013	0.087	0.020	0.094	-77.011	78.723	8.046
10	16	1218109	0.083	0.024	0.090	-71.084	73.333	8.434
11	4	1136157	0.079	0.020	0.089	-74.684	77.528	12.658
12	12	1125157	0.081	0.019	0.086	-76.543	77.907	6.173
13	31	1051829	0.072	0.018	0.080	-75.000	77.500	11.111
14	19	973455	0.069	0.015	0.069	-78.261	78.261	0.000
15	28	966265	0.069	0.016	0.079	-76.812	79.747	14.493
16	11	832099	0.062	0.012	0.057	-80.645	78.947	-8.065
17	18	825431	0.055	0.014	0.059	-74.545	76.271	7.273
18	23	763607	0.049	0.014	0.060	-71.429	76.667	22.449
19	15	693425	0.044	0.014	0.053	-68.182	73.585	20.455
20	2	668417	0.045	0.012	0.054	-73.333	77.778	20.000
21	8	632741	0.044	0.010	0.053	-77.273	81.132	20.455
22	25	607821	0.036	0.013	0.048	-63.889	72.917	33.333
23	24	603043	0.040	0.010	0.045	-75.000	77.778	12.500
24	22	563415	0.037	0.010	0.041	-72.973	75.610	10.811
25	21	524651	0.033	0.007	0.038	-78.788	81.579	15.152
26	17	509195	0.032	0.008	0.036	-75.000	77.778	12.500
27	13	451431	0.033	0.007	0.037	-78.788	81.081	12.121
28	27	352813	0.023	0.006	0.029	-73.913	79.310	26.087
29	29	283431	0.018	0.005	0.024	-72.222	79.167	33.333
30	26	82473	0.005	0.002	0.006	-60.000	66.667	20.000

	word_QWERTY_m	freq_QWERTY_m
0	er	504450.0
1	as	226028.0
2	gh	83976.0
3	io	82904.0
4	rt	68268.0
5	m,	23564.0
6	ui	22496.0
7	xc	6456.0
8	l;	2794.0
9	nm	1090.0
10	90	250.0
11	yu	100.0
12	0-	68.0
13	vb	2.0

	book No.	num_combinations	ABC_right_left%	QWERTY_right_left%	QWERTY_m_right_left%	QWERTY vs ABC right left %	QWERTY_m vs QWERTY %
0	1	10405529	6.213	8.651	8.734	39.240	0.959
1	7	2595957	1.694	2.153	2.075	27.096	-3.623
2	9	2420593	1.585	1.970	1.916	24.290	-2.741
3	14	2284279	1.477	1.883	1.797	27.488	-4.567
4	6	1765541	1.177	1.533	1.479	30.246	-3.523
5	20	1747407	0.987	1.324	1.287	34.144	-2.795
6	3	1629763	1.066	1.351	1.293	26.735	-4.293
7	30	1260439	0.825	1.019	0.989	23.515	-2.944
8	10	1237183	0.816	1.044	1.005	27.941	-3.736
9	5	1231013	0.791	1.013	0.970	28.066	-4.245
10	16	1218109	0.798	1.022	0.992	28.070	-2.935
11	4	1136157	0.772	0.972	0.934	25.907	-3.909
12	12	1125157	0.731	0.922	0.895	26.129	-2.928
13	31	1051829	0.671	0.853	0.828	27.124	-2.931
14	19	973455	0.592	0.777	0.753	31.250	-3.089
15	28	966265	0.636	0.822	0.787	29.245	-4.258
16	11	832099	0.487	0.672	0.647	37.988	-3.720
17	18	825431	0.532	0.672	0.653	26.316	-2.827
18	23	763607	0.474	0.628	0.601	32.489	-4.299
19	15	693425	0.467	0.581	0.552	24.411	-4.991
20	2	668417	0.451	0.554	0.536	22.838	-3.249
21	8	632741	0.412	0.525	0.503	27.427	-4.190
22	25	607821	0.379	0.482	0.469	27.177	-2.697
23	24	603043	0.379	0.491	0.478	29.551	-2.648
24	22	563415	0.357	0.473	0.454	32.493	-4.017
25	21	524651	0.327	0.424	0.409	29.664	-3.538
26	17	509195	0.319	0.413	0.396	29.467	-4.116
27	13	451431	0.291	0.378	0.364	29.897	-3.704
28	27	352813	0.232	0.292	0.281	25.862	-3.767
29	29	283431	0.190	0.232	0.226	22.105	-2.586
30	26	82473	0.050	0.061	0.060	22.000	-1.639

	ABC	QWERTY	QWERTY_m	ABC_left	ABC_right	QWERTY_left	QWERTY_right	QWERTY_m_left	QWERTY_m_right
0	-3	23	23	-	r	2	7	2	7
1	35	34	34	3	s	3	8	3	8
2	57	45	45	5	t	4	9	4	9
3	79	56	56	7	u	5	-	5	0
4	9n	67	67	9	v	6	,	6	-
5	no	78	78	n	w	q	y	q	y
6	op	89	89	o	x	w	i	w	u
7	pq	9-	90	p	y	e	u	e	i
8	qr	-,	0-	q	z	.	o	r	o
9	rs	qw	qw	2	e	t	p	t	p
10	st	we	we	4	f	z	h	a	h
11	tu	e.	er	6	g	s	j	s	j
12	uv	.t	rt	8	h	d	k	d	k
13	vw	ty	ty	.	i	f	l	f	l
14	wx	yi	yu	a	j	g	m	g	n
15	xy	iu	ui	b	k	a	b	z	m
16	yz	op	io	c	l	x	n	x	,
17	24	zs	op	d	m	&	?	c	.
18	46	sd	as	NaN	NaN	c	;	v	;
19	68	df	sd	NaN	NaN	v	r	b	:
20	8.	fg	df	NaN	NaN	NaN	NaN	NaN	NaN
21	.a	hj	fg	NaN	NaN	NaN	NaN	NaN	NaN
22	ab	kl	gh	NaN	NaN	NaN	NaN	NaN	NaN
23	bc	lm	hj	NaN	NaN	NaN	NaN	NaN	NaN
24	cd	ax	jk	NaN	NaN	NaN	NaN	NaN	NaN
25	de	x&	kl	NaN	NaN	NaN	NaN	NaN	NaN
26	ef	&c	l;	NaN	NaN	NaN	NaN	NaN	NaN
27	fg	cv	zx	NaN	NaN	NaN	NaN	NaN	NaN
28	gh	bn	xc	NaN	NaN	NaN	NaN	NaN	NaN
29	hi	n?	cv	NaN	NaN	NaN	NaN	NaN	NaN
30	ij	?;	vb	NaN	NaN	NaN	NaN	NaN	NaN
31	jk	;R	bn	NaN	NaN	NaN	NaN	NaN	NaN
32	kl	NaN	nm	NaN	NaN	NaN	NaN	NaN	NaN
33	lm	NaN	m,	NaN	NaN	NaN	NaN	NaN	NaN

	ABC_right_left	freq_ABC_right_left	QWERTY_right_left	freq_QWERTY_right_left	QWERTY_m_right_left	freq_QWERTY_m_right_left
0	sum	10848723.0	sum	14168417.0	sum	13826705.0
1	in	524348.0	th	1018544.0	th	1018544.0
2	ha	337992.0	he	955600.0	he	955600.0
3	ou	329048.0	an	546608.0	an	546608.0
4	at	317570.0	er	504450.0	nd	425868.0
5	en	306508.0	nd	425868.0	ha	337992.0
6	ed	294156.0	re	408762.0	en	306508.0
7	or	277918.0	ha	337992.0	or	277918.0
8	to	277216.0	en	306508.0	to	277216.0
9	ng	248766.0	to	277216.0	it	265516.0
10	of	246634.0	it	265516.0	is	253572.0
11	ar	234122.0	is	253572.0	ng	248766.0
12	as	226028.0	ng	248766.0	of	246634.0
13	ot	200408.0	of	246634.0	ot	200408.0
14	ea	190388.0	ar	234122.0	le	199114.0
15	nt	184092.0	ot	200408.0	me	192194.0
16	al	182360.0	le	199114.0	nt	184092.0
17	ne	174548.0	me	192194.0	al	182360.0
18	ho	169644.0	nt	184092.0	ne	174548.0
19	fo	165292.0	al	182360.0	fo	165292.0
20	be	164056.0	ne	174548.0	ro	155642.0
21	ro	155642.0	fo	165292.0	ti	151844.0
22	de	147520.0	be	164056.0	sh	145856.0
23	wa	140816.0	ti	151844.0	ri	138692.0
24	ch	131350.0	sh	145856.0	el	137120.0
25	om	128146.0	el	137120.0	co	132332.0
26	ma	126754.0	co	132332.0	ch	131350.0
27	ai	123014.0	ch	131350.0	wi	129666.0
28	ce	117200.0	wi	129666.0	ut	129210.0
29	un	111034.0	ut	129210.0	wh	127452.0

	ABC	QWERTY	QWERTY_m	ABC_left	ABC_right	QWERTY_left	QWERTY_right	QWERTY_m_left	QWERTY_m_right
0	-3	23	23	-	r	2	7	2	7
1	35	34	34	3	s	3	8	3	8
2	57	45	45	5	t	4	9	4	9
3	79	56	56	7	u	5	-	5	0
4	9n	67	67	9	v	6	,	6	-
5	no	78	78	n	w	q	y	q	y
6	op	89	89	o	x	w	i	w	u
7	pq	9-	90	p	y	e	u	e	i
8	qr	-,	0-	q	z	.	o	r	o
9	rs	qw	qw	2	e	t	p	t	p
10	st	we	we	4	f	z	h	a	h
11	tu	e.	er	6	g	s	j	s	j
12	uv	.t	rt	8	h	d	k	d	k
13	vw	ty	ty	.	i	f	l	f	l
14	wx	yi	yu	a	j	g	m	g	n
15	xy	iu	ui	b	k	a	b	z	m
16	yz	op	io	c	l	x	n	x	,
17	24	zs	op	d	m	&	?	c	.
18	46	sd	as	NaN	NaN	c	;	v	;
19	68	df	sd	NaN	NaN	v	r	b	:
20	8.	fg	df	NaN	NaN	NaN	NaN	NaN	NaN
21	.a	hj	fg	NaN	NaN	NaN	NaN	NaN	NaN
22	ab	kl	gh	NaN	NaN	NaN	NaN	NaN	NaN
23	bc	lm	hj	NaN	NaN	NaN	NaN	NaN	NaN
24	cd	ax	jk	NaN	NaN	NaN	NaN	NaN	NaN
25	de	x&	kl	NaN	NaN	NaN	NaN	NaN	NaN
26	ef	&c	l;	NaN	NaN	NaN	NaN	NaN	NaN
27	fg	cv	zx	NaN	NaN	NaN	NaN	NaN	NaN
28	gh	bn	xc	NaN	NaN	NaN	NaN	NaN	NaN
29	hi	n?	cv	NaN	NaN	NaN	NaN	NaN	NaN
30	ij	?;	vb	NaN	NaN	NaN	NaN	NaN	NaN
31	jk	;R	bn	NaN	NaN	NaN	NaN	NaN	NaN
32	kl	NaN	nm	NaN	NaN	NaN	NaN	NaN	NaN
33	lm	NaN	m,	NaN	NaN	NaN	NaN	NaN	NaN