diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dbe9c82b3610ccd58d1c681848dcd322e500051e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.vscode/
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000000000000000000000000000000000000..87634fa2cb12701860989a0ebeb17374dccd71e9
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,58 @@
+{
+    "files.associations": {
+        "scx_sjf_common.h": "c",
+        "array": "c",
+        "atomic": "c",
+        "bit": "c",
+        "*.tcc": "c",
+        "cctype": "c",
+        "chrono": "c",
+        "clocale": "c",
+        "cmath": "c",
+        "compare": "c",
+        "concepts": "c",
+        "cstdarg": "c",
+        "cstddef": "c",
+        "cstdint": "c",
+        "cstdio": "c",
+        "cstdlib": "c",
+        "cstring": "c",
+        "ctime": "c",
+        "cwchar": "c",
+        "cwctype": "c",
+        "deque": "c",
+        "map": "c",
+        "string": "c",
+        "unordered_map": "c",
+        "vector": "c",
+        "exception": "c",
+        "algorithm": "c",
+        "functional": "c",
+        "iterator": "c",
+        "memory": "c",
+        "memory_resource": "c",
+        "numeric": "c",
+        "random": "c",
+        "ratio": "c",
+        "string_view": "c",
+        "system_error": "c",
+        "tuple": "c",
+        "type_traits": "c",
+        "utility": "c",
+        "initializer_list": "c",
+        "iosfwd": "c",
+        "iostream": "c",
+        "istream": "c",
+        "limits": "c",
+        "new": "c",
+        "numbers": "c",
+        "ostream": "c",
+        "semaphore": "c",
+        "sstream": "c",
+        "stdexcept": "c",
+        "stop_token": "c",
+        "streambuf": "c",
+        "thread": "c",
+        "typeinfo": "c"
+    }
+}
\ No newline at end of file
diff --git a/README.md b/README.md
deleted file mode 100644
index 3bd3470e814b09d7d91453c637701fcc2fa6eb15..0000000000000000000000000000000000000000
--- a/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# proj134-CFS-based-userspace-scheduler
-
-## 说明
-
-本仓库是哈尔滨工业大学(深圳)COS队的项目仓库,赛题为[proj134-CFS-based-userspace-scheduler](https://github.com/oscomp/proj134-CFS-based-userspace-scheduler)
-
diff --git a/client/Makefile b/client/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..add1b1e87d0af5ebf7d773753105ffe07683034e
--- /dev/null
+++ b/client/Makefile
@@ -0,0 +1,23 @@
+CC = g++
+INCLUDES :=  $(CURDIR)/../lib
+CFLAGS = -std=c++11 -Wall -I$(INCLUDES)
+LDFLAGS = 
+
+TARGET = simple_client
+SRCS = simple_client.cpp 
+OBJS = $(SRCS:.cpp=.o)
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@
+
+cos_client: cos_client.o
+	$(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@
+
+%.o: %.cpp
+	$(CC) $(CFLAGS) $(LDFLAGS) -c $< -o $@
+
+clean:
+	rm -rf $(TARGET) cos_client cos_client.o $(OBJS) shm*
+
diff --git a/client/cos_client.cpp b/client/cos_client.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b5690cf8d6eead79088d884059ff1681ad1f8a4c
--- /dev/null
+++ b/client/cos_client.cpp
@@ -0,0 +1,63 @@
+#include <sched.h>
+#include <unistd.h>   // getpid, gettid, sleep, ftruncate
+#include <vector>
+#include <memory>     // std::unique_ptr
+#include <stdio.h>
+#include <sys/mman.h>
+#include <fcntl.h>    // for O_RDWR and open
+#include <cstring>
+// include order matters
+#include "cos_client.h"
+#include "cos.h"
+#include "cos_thread.h"
+
+#include "hash.h"
+
+#define SCHED_EXT 7
+
+struct option{
+    int worker_size;
+};
+
+struct option get_options_from_args(int argc, char** argv){
+    return {100};  // TODO: parse argv; fixed at 100 workers for now
+}
+
+int main(int argc, char** argv){
+    struct option op = get_options_from_args(argc,argv);
+    printf("%d:aaaaaaa laotan\n",getpid());
+
+    char buf[128];
+    sprintf(buf,"/etc/cos/shm/shm_%d",getpid());
+    int shm_fd = open(buf, O_RDWR | O_CREAT | O_TRUNC, 0644);
+    ftruncate(shm_fd, SHM_SIZE);  // size the file before touching the mapping
+    void* shm = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
+    memset(shm, 0, SHM_SIZE);
+
+    LFHashTable<struct entry> hashtable(shm, SHM_SIZE, 0);
+    std::vector<std::unique_ptr<CosThread>> workers;
+    for(int i=0;i<op.worker_size;i++){
+        workers.emplace_back(new CosThread(CosThread::KernelSchedulerType::kExt, []{
+            while(true){ // do some work
+                printf("%d:working...%d\n",gettid(),sched_getscheduler(gettid()));
+                sleep(1);
+            }
+        }));
+    }
+
+    for (auto& t : workers) {
+        t->WaitUntilInitComplete();
+        int tid = t->tid();
+        hashtable.Add(tid, {tid});
+
+        struct sched_param param = { .sched_priority = 0 }; 
+        sched_setscheduler(tid, SCHED_EXT, &param); 
+
+        printf("%d:唤醒了%d\n",getpid(),tid);
+        t->NotifyWork();
+    }
+
+    for (auto& t : workers) t->Join();
+
+    return 0;
+}
\ No newline at end of file
diff --git a/client/simple_client.cpp b/client/simple_client.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d2f65bd844db96d1a7422e7bb89efc9cab01b1ba
--- /dev/null
+++ b/client/simple_client.cpp
@@ -0,0 +1,48 @@
+#include <sched.h>
+#include <unistd.h>
+#include <vector>
+#include <thread>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <fcntl.h> // for O_RDWR and open
+#include <sstream>   
+#include <cstring>
+#include "hash.h"
+
+#define SCHED_EXT 7
+#define SHM_SIZE 4096
+
+
+int main(int argc, char** argv) {
+
+    char buf[32];
+    sprintf(buf, "/etc/cos/shm/shm_%d", getpid());
+    int shm_fd = open(buf, O_RDWR | O_CREAT | O_TRUNC, 0644);
+    ftruncate(shm_fd, SHM_SIZE); 
+    int* shm = (int*)mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
+    memset(shm, 0, SHM_SIZE);
+
+    int num_workers = 10;
+    LFHashTable<int32_t> hashtable((void*)shm, SHM_SIZE, 0);
+    std::vector<std::thread> workers(num_workers);
+
+    for (int i = 0 ; i < num_workers; i++) {
+        workers[i] = std::thread ([&hashtable] { 
+
+            hashtable.Add(gettid(), gettid());
+
+            struct sched_param param = { .sched_priority = 0 }; 
+            sched_setscheduler(gettid(),SCHED_EXT,&param); 
+
+            printf("调度类为:%d\n",sched_getscheduler(gettid()));
+            
+            while(true){ 
+                printf("%d working...\n", gettid());
+                // sleep(1);
+            }
+        });
+    }
+    for (int i = 0 ; i < num_workers; i++) {
+        workers[i].join();
+    }
+}
\ No newline at end of file
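
Both clients above publish one record per worker thread into a file under /etc/cos/shm/ named after the client's pid, and switch the workers to SCHED_EXT so an agent can pick them up. Below is a minimal sketch of how an agent-side reader might attach to the same region and look a worker up by tid; it mirrors simple_client.cpp's int32_t value type and key_info of 0, and the thin error handling is deliberate (this is an illustration, not part of the patch).

// agent_reader_sketch.cpp -- illustrative only; attaches to a region created by
// simple_client.cpp and prints the value stored for one worker tid.
#include <fcntl.h>
#include <sys/mman.h>
#include <cstdio>
#include <cstdlib>
#include "hash.h"

#define SHM_SIZE 4096

int main(int argc, char** argv) {
    if (argc != 3) {
        fprintf(stderr, "usage: %s <client-pid> <worker-tid>\n", argv[0]);
        return 1;
    }
    char path[64];
    snprintf(path, sizeof(path), "/etc/cos/shm/shm_%s", argv[1]);

    int fd = open(path, O_RDWR);  // attach to the existing region; do not truncate it
    if (fd < 0) { perror("open"); return 1; }
    void* shm = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (shm == MAP_FAILED) { perror("mmap"); return 1; }

    // The key_info argument (0 here) must match what the writer used.
    LFHashTable<int32_t> table(shm, SHM_SIZE, 0);
    printf("value stored for tid %s: %d\n", argv[2], table.Get(atoi(argv[2])));
    return 0;
}
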
diff --git a/gnu/stubs.h b/gnu/stubs.h
new file mode 100644
index 0000000000000000000000000000000000000000..719225b1662697f90ab04d9a0f6562e9b4bc34d0
--- /dev/null
+++ b/gnu/stubs.h
@@ -0,0 +1 @@
+/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */
diff --git a/lib/cos.h b/lib/cos.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e334848dc8445b9bc0e841090f47d8fde693e0a
--- /dev/null
+++ b/lib/cos.h
@@ -0,0 +1,7 @@
+struct entry;
+
+// Size of each shared-memory block
+#define SHM_SIZE 4096
+
+// // Maximum number of entries per shm block
+// #define MAX_ENTRY_NUMS SHM_SIZE / sizeof(struct entry)
\ No newline at end of file
diff --git a/lib/cos_client.h b/lib/cos_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..88a129210eccb2284721de031f0e4b8647e2e2ec
--- /dev/null
+++ b/lib/cos_client.h
@@ -0,0 +1,6 @@
+// Must be fully defined (not just declared) here.
+// Lives in its own header so the agent can include it as well.
+// Record stored in shared memory describing one thread's scheduling info; each worker thread owns one.
+struct entry{
+    int ddl;  // deadline
+};
\ No newline at end of file
diff --git a/lib/cos_thread.h b/lib/cos_thread.h
new file mode 100644
index 0000000000000000000000000000000000000000..239a1671b8ce7cea0d653fc8ffc6baeed2432667
--- /dev/null
+++ b/lib/cos_thread.h
@@ -0,0 +1,80 @@
+#include <thread>
+#include <functional>// for std::function
+#include <unistd.h>
+
+class CosThread {
+ public:
+  // The kernel scheduling class to run the thread in.
+  enum class KernelSchedulerType {
+    // Linux Completely Fair Scheduler.
+    kCfs,
+    // ext.
+    kExt,
+  };
+
+  explicit CosThread(KernelSchedulerType ksched, std::function<void()> work) {
+    work_start_ = false;
+    init_complete_ = false;
+    ksched_ = ksched;
+    thread_ = std::thread([this, w = std::move(work)] {
+        tid_ = gettid();
+        NotifyInitComplete();
+
+        // if (ksched_ == KernelSchedulerType::kExt) {
+          WaitUntilWork();
+        // }
+
+        std::move(w)();
+    });
+  }
+  explicit CosThread(const CosThread&) = delete;
+  CosThread& operator=(const CosThread&) = delete;
+  ~CosThread() = default;
+
+  // Joins the thread.
+  void Join() {
+    thread_.join();
+  }
+
+  void WaitUntilWork(){
+    while(!work_start_){
+      sched_yield();
+    }
+  }
+
+  void NotifyWork(){
+    work_start_ = true;
+  }
+
+  void WaitUntilInitComplete(){
+    while(!init_complete_){
+      sched_yield();
+    }
+  }
+
+  void NotifyInitComplete(){
+    init_complete_ = true;
+  }
+
+  bool Joinable() const { return thread_.joinable(); }
+
+  // Called by the outside world. The value is only valid after WaitUntilInitComplete() has returned.
+  int tid() { 
+    return tid_; 
+  }
+
+ private:
+  // Busy-waited flag: set to true by the main thread once the worker may start its work function.
+  volatile bool work_start_;
+
+  volatile bool init_complete_;  // set to true by the worker thread once tid_ is recorded
+
+  // The thread's TID (thread identifier).
+  int tid_;
+
+  // The kernel scheduling class the thread is running in.
+  KernelSchedulerType ksched_;
+
+  // The thread.
+  std::thread thread_;
+};
\ No newline at end of file
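
CosThread synchronizes its two handshakes (init-complete and work-start) with plain volatile bool flags that are busy-waited with sched_yield(). In C++11, volatile gives neither atomicity nor ordering guarantees across threads, even if it tends to behave on x86 for a single flag. A drop-in sketch of the same gate using std::atomic<bool> is shown below; StartGate is a hypothetical helper, not part of this patch.

// Sketch: the busy-waited handshake flag expressed with std::atomic<bool>,
// which gives well-defined cross-thread visibility under the C++11 memory model.
#include <atomic>
#include <sched.h>

class StartGate {
 public:
  void Wait() {
    while (!flag_.load(std::memory_order_acquire))
      sched_yield();           // same yield-and-retry loop as CosThread uses
  }
  void Notify() { flag_.store(true, std::memory_order_release); }

 private:
  std::atomic<bool> flag_{false};
};

Swapping the two volatile members for such gates would not change the calling code: WaitUntilInitComplete()/NotifyInitComplete() and WaitUntilWork()/NotifyWork() map directly onto Wait()/Notify().
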
diff --git a/lib/hash.h b/lib/hash.h
new file mode 100644
index 0000000000000000000000000000000000000000..82b325497715df4b84770c5eb121f8e71ea67b36
--- /dev/null
+++ b/lib/hash.h
@@ -0,0 +1,70 @@
+#include <iostream>
+#include <atomic>
+#include <cassert>
+
+template <typename THDINFO>
+struct Entry {
+    int32_t key;
+    THDINFO value;
+};
+
+// Hash table shared between the client and the agent: keys are tids, values are the THDINFO scheduling records to hand over. Open addressing with linear probing over shared memory; assumes the table never fills completely.
+template <typename THDINFO>
+class LFHashTable {
+private:
+    // 用于进程间通信的共享内存
+    void* shd_mem_;
+    // 哈希表能存储kv对的个数
+    size_t capacity_;
+    // 存储哈希表的共享内存,是shd_mem_的不同表现形式
+    Entry<THDINFO>* table_;
+    // 关键信息,例如主线程id号,hash函数通过主线程id号来确定哈希函数
+    int32_t key_info_;
+
+public:
+    LFHashTable(){}
+
+    LFHashTable(void* shd_mem, size_t mem_size, int32_t key_info): shd_mem_(shd_mem), key_info_(key_info) {
+        capacity_ = mem_size / sizeof(Entry<THDINFO>);
+        table_ = reinterpret_cast<Entry<THDINFO>*>(shd_mem);
+    }
+
+    bool Add(int32_t key, THDINFO value) {
+        assert(key != 0);
+        for (int idx = hash(key); ; idx = (idx + 1) % capacity_) {
+            if (table_[idx].key == 0) {
+                if (!__sync_bool_compare_and_swap(&table_[idx].key, 0, key)) {
+                    continue;
+                }
+            }
+            if (table_[idx].key != key) {
+                continue;
+            }
+            table_[idx].value = value;
+            return true;
+        }
+    }
+
+    THDINFO Get(int32_t key) {
+        assert(key != 0);
+        for (int idx = hash(key); ; idx = (idx + 1) % capacity_) {
+            if (table_[idx].key == 0) {
+                return {};
+            }
+            if (table_[idx].key != key) {
+                continue;
+            }
+            return table_[idx].value;
+        }
+    }
+
+private:
+    // Map a tid into [0, capacity_); key_info_ is typically the main thread's tid.
+    int32_t hash(int32_t key) {
+        return (key - key_info_) % capacity_;
+    }
+
+};
+
+
+
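A self-contained smoke test of the table on an ordinary heap buffer is sketched below. The buffer must start zeroed because a key of 0 marks an empty slot (which is also why Add() and Get() assert key != 0); capacity_ works out to mem_size / sizeof(Entry<T>), i.e. 512 slots for a 4 KiB block of int32_t values.

// lfhashtable_smoketest.cpp -- illustration only, not part of the patch.
#include <cstdio>
#include <cstdlib>
#include "hash.h"

int main() {
    const size_t kSize = 4096;
    void* buf = calloc(1, kSize);                 // zeroed: key == 0 means "empty slot"
    LFHashTable<int32_t> table(buf, kSize, 0);

    table.Add(1234, 42);
    printf("Get(1234) = %d\n", table.Get(1234));  // prints 42
    printf("Get(999)  = %d\n", table.Get(999));   // miss -> value-initialized 0
    free(buf);
    return 0;
}
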
diff --git a/record/dzh/face_book.md b/record/dzh/face_book.md
deleted file mode 100644
index 6973028385c70c30e0874b7af727c028db80fb73..0000000000000000000000000000000000000000
--- a/record/dzh/face_book.md
+++ /dev/null
@@ -1,19 +0,0 @@
-长期以来,Linux系统管理员、发行版开发者和应用程序所有者一直在调整位于/proc/sys中(现在在debugfs中)的CFS设置。实际上,这些设置的作用是更改任务抢占的可能性,或通过将wakeup_granularity_ns设置为大于latency_ns的一半来禁用它。其他设置对性能并没有太大影响。
-
-换句话说,对于一些工作负载,长时间运行的任务被处理短期请求的任务抢占可以提高性能,而对于一些只运行短期请求的工作负载,不被抢占反而更有利。
-
-这引发了一些观察和想法:
--不同的工作负载需要不同的策略。能够针对每个工作负载进行配置可能很有用。
--从不被抢占中获益的工作负载仍然可能从抢占(低优先级)后台系统任务中获益。
--在生产中快速(且安全地)尝试不同的策略,而无需关闭应用程序或重新启动系统,以确定不同工作负载的策略,将会很有用。
--只有很少的工作负载足够大且敏感,需要其自己的策略调整。对于其他所有情况,CFS本身应该足够好,我们可能不想将策略调整替换CFS所做的任何事情。
-
-这引出了BPF钩子。在各种内核子系统中,BPF钩子已被成功用于提供一种外部代码安全地更改一些内核决策的方式。BPF工具使这变得相当容易,部署BPF脚本的人已经习惯于为新内核版本更新它们。
-
-此补丁集旨在开始讨论BPF在调度程序中的潜在应用。它还旨在着陆一些非常基本的BPF基础架构,以添加新的BPF钩子到调度程序中,一组最小的有用的辅助程序,相应的libbpf更改等等。
-
-我们在CFS中使用BPF的第一次实验看起来非常有前途。我们处于非常早期的阶段,但是我们已经看到了我们(Facebook的)主要Web工作负载的良好延迟和约1%的RPS提升。
-
-据我所知,谷歌正在进行一种更激进的方法[2]:他们打算将调度代码移动到用户空间。看起来他们的核心动机有些类似:使调度器更易于开发、验证和部署。尽管他们的方法不同,但他们也使用BPF来加速一些热点路径。我认为建议的基础设施也可以为他们的目的服务。
-
-一个用户空间部分的例子,它加载了一些简单的挂钩,在这里[3]提供仅仅是为了简化使用提供的内核补丁的操作。
diff --git a/record/dzh/ghost_lib.md b/record/dzh/ghost_lib.md
deleted file mode 100644
index b541ad140473d7a5b18ffb50a59c53751cbc06a4..0000000000000000000000000000000000000000
--- a/record/dzh/ghost_lib.md
+++ /dev/null
@@ -1,435 +0,0 @@
-## 前言
-
-enclave, scheduler, agent, channel, message,statusword, statuswordtable, runrequest, task,ghost,cpu。这最核心的概念在ghost用户态调度框架里的关系是?
-
-
-
-## Agent
-
-#### Agent
-
-explain:论文中的agent,和cpu一一对应,属于enclave,和schduler一一对应,也就是说每一个agent都有自己的调度算法
-
-```
-StartBegin():agent线程开始执行threadbody,对于localagent是将当前agent迁移到其管理的cpu上
-StartComplete():等待enclave ready
-TerminateBegin():通知条件变量
-TerminateComplete():摧毁线程资源
-ThreadBody():agent线程所执行的函数
-AgentThread():在agent相关准备就绪后,由ThreadBody调度。由具体的调度算法的agent去实现
-Ping():让agent线程回到它所管理的cpu上执行
-SignalReady() :在agent初始化结束后唤醒start_complete(),让其可以接下来调用enclave的ready方法
-WaitForEnclaveReady() :等待所在enclave的ready
-AgentScheduler():返回agent的调度类,返回空值,被其他继承的调度类重写,其他继承的调度算法会有自己的Scheduler调度类
-
-Enclave* enclave_:agent所属enclave
-Gtid gtid_:agent的ghost线程号
-Cpu cpu_:agent所管理的cpu
-Notification ready_, finished_, enclave_ready_, do_exit_:相关用于同步的条件变量
-std::thread thread_:运行agent的线程
-```
-
-#### LocalAgent
-
-explain:继承agent,供其他调度算法继承,如FifoAgent。重写了ThreadBody(就是上面的),多了statusword字段
-
-```
-LocalStatusWord status_word_:通过此内核共享内存获取相关信息,如cpu空闲,Aseq等
-```
-
-#### FullAgent
-
-explain:将单个enclave下的agent,task,scheduler汇集起来,别的调度算法会新建一个个性化类去继承这个类,如FullFifoAgent,和enclave一一对应
-
-```
-StartAgentTasks():创建当前enclave下的所有cpu的agent,和它们对应的cpu绑定起来,并且一次调用它们的StartBegin方法迁移到对应cpu上运行
-TerminateAgentTasks():被派生类的析构方法调用
-
-LocalEnclave enclave_:一一对应的enclave
-std::vector<std::unique_ptr<Agent>> agents_:其中所包含的agent
-```
-
-
-#### FullFifoAgent(p)
-
-explain:继承FullAgent
-
-```
-FullFifoAgent(AgentConfig config):整个调度算法初始化的开始
-
-std::unique_ptr<FifoScheduler> scheduler_:按道理来说,每个agent有一个调度类,也就是说这里应该是一个调度类的list,但是这里只有一个,我觉得应该是因为调度算法是FIFO,所以所有agent应该统一成一个调度类
-```
-
-#### FullFifoAgent(c)
-
-explain:继承FullAgent
-
-```
-FullFifoAgent(FifoConfig config):整个调度算法初始化的开始
-
-std::unique_ptr<FifoScheduler> scheduler_:按道理来说,每个agent有一个调度类,也就是说这里应该是一个调度类的list,但是这里只有一个,我觉得应该是因为调度算法是FIFO,所以所有agent应该统一成一个调度类
-```
-
-#### AgentProcess
-explain:一个地址空间一模一样的父子进程,负责运行FullAgent
-
-```
-AgentProcess(AgentConfig config):构造出父子两进程,父进程负责幕后,子主线程负责创建agent线程并且等待退出,config将传给FullAgent的构造方法
-
-std::unique_ptr<ForkedProcess> agent_proc_:fork系统调用的封装
-std::unique_ptr<FullAgent> full_agent_:被运行的FullAgent
-std::unique_ptr<SharedBlob> sb_:父子进程共享此内存进行通信
-```
-
-
-## Task
-
-#### Task
-
-explain:代表一次要被调度上cpu的任务,被调度算法继承,如FifoTask
-
-```
-Gtid gtid:被调度task所代表线程的gtid
-LocalStatusWord status_word:和内核共享的seq等信息
-Seqnum seqnum:seq
-```
-
-#### TaskAllocator
-
-explain:存储task
-
-#### SingleThreadMallocTaskAllocator
-
-explain:继承TaskAllocator
-
-```
-
-```
-
-
-
-
-
-## Sheduler
-
-#### Sheduler
-
-explain:做调度决策的类,和agent应该是M对N?
-
-```
-Scheduler(Enclave* enclave, CpuList cpus):将当前调度类加入enclave中
-EnclaveReady():TODO
-DiscoverTasks():TODO
-GetDefaultChannel()
-GetAgentChannel(const Cpu& cpu)
-
-Enclave* const enclave_
-CpuList cpus_;
-```
-
-#### BasicDispatchScheduler
-
-explain:继承Sheduler,一种调度器实现,能够解码原始消息(来自channel),将它们与任务派生类型相关联,并调度到适当的调度类方法。其他的调度算法会继承这个类,如FifoScheduler
-
-```
-BasicDispatchScheduler(Enclave* enclave, CpuList cpus, std::shared_ptr<TaskAllocator<TaskType>> allocator)
-void DispatchMessage(const Message& msg) :将消息根据类型进行相应处理
-
-相应处理方法:交给对应调度类去实现
-CpuTick(const Message& msg)
-CpuNotIdle(const Message& msg) 
-CpuTimerExpired(const Message& msg) 
-CpuAvailable(const Message& msg) 
-CpuBusy(const Message& msg) 
-AgentBlocked(const Message& msg) 
-AgentWakeup(const Message& msg) 
-
-std::shared_ptr<TaskAllocator<TaskType>> const allocator_
-```
-
-
-
-
-
-
-
-
-
-
-## Enclave
-
-#### Enclave
-
-explain:论文中的enclave,上面包含运行的agent和scheduler,cpu拓扑
-
-```
-Enclave(const AgentConfig config):通过agentconfig去构造enclave
-GetRunRequest(const Cpu& cpu)获取指定cpu上的runrequest
-CommitRunRequest(RunRequest* req):commit此runrequest,底层调用ghost的接口
-SubmitRunRequest(RunRequest* req):submit此runrequest,底层调用ghost的接口
-CompleteRunRequest(RunRequest* req):complete此runrequest,底层调用ghost的接口,和上面那个接口配合使用估计
-LocalYieldRunRequest(const RunRequest* req, BarrierToken agent_barrier, int flags):agent结束在当前cpu上的调度
-Ready():必须在当前enclave上的所有agent和所有scheduler被构造后才能调用
-WaitForOldAgent():如果有一个老agent还在此enclave上,等待直到它退出
-AttachAgent(const Cpu& cpu, Agent* agent)
-void DetachAgent(Agent* agent)
-AttachScheduler(Scheduler* scheduler)
-DetachScheduler(Scheduler* scheduler)
-
-const AgentConfig config_:代表本enclave相关参数
-Topology* topology_:机器的cpu拓扑
-CpuList enclave_cpus_:本enclave包含的cpu!!!
-std::list<Scheduler*> schedulers_:在enclave上运行的schedulers
-std::list<Agent*> agents_:在enclave上运行的agent
-```
-
-#### LocalEnclave
-
-explain:继承enclave,不能再被继承
-
-```
-MakeChannel(int elems, int node, const CpuList& cpulist)
-struct CpuRep {
-   Agent* agent;
-   LocalRunRequest req;
-}
-CpuRep cpus_[MAX_CPUS]:cpu与其一一对应的agent,runrequest
-ghost_cpu_data* data_region_:内核共享通信区域
-size_t data_region_size_
-int dir_fd_ = -1:enclave相当于目录
-int ctl_fd_ = -1:控制enclave的fd
-```
-
-## RunRequest
-
-
-#### RunRequestOptions 
-explain: commit一个txn的参数
-
-```
-Gtid target = Gtid(0) //the task to run next
-BarrierToken target_barrier //Tseq
-BarrierToken agent_barrier = StatusWord::NullBarrierToken() // Aseq
-int commit_flags = 0 // controls how a transaction is committed
-int run_flags = 0  // control a variety of side-effects when the task either gets oncpu or offcpu
-```
-
-
-#### RunRequest
-
-explain:代表一次commit的请求容器,容器中装的是task,和一个cpu一一对应,底层将调用GhostHelper()->Run()提交本次请求
-
-```
-Init(Enclave* enclave, const Cpu& cpu):初始化runrequest,其所在的enclave和一一对应的cpu
-Open(const RunRequestOptions& options):开启一个即将要提交的事务,相关参数位于options
-void OpenUnschedule()   
-void LocalYield(const BarrierToken agent_barrier, const int flags) :Agent must call LocalYield when it has nothing to do
-bool Ping() :懂得都懂
-bool Commit() :对ghost commit的封装
-bool Submit() :对ghost submit的封装
-
-
-Enclave* enclave_
-Cpu cpu_;
-```
-
-#### LocalRunRequest
-
-explain:继承runrequest
-
-```
-Init(Enclave* enclave, const Cpu& cpu, ghost_txn* txn):初始化,但是多了代表事务相关信息的txn
-
-ghost_txn* txn_:代表事务相关信息的txn
-```
-
-
-
-
-
-
-
-
-
-## Cpu
-
-#### Cpu
-
-explain:一个cpu的相关信息,如L3Cache,NUMA等
-
-```
-struct CpuRep {
-    int cpu;
-    int core;
-    int smt_idx;
-    std::unique_ptr<CpuList> siblings;
-    std::unique_ptr<CpuList> l3_siblings;
-    int numa_node;
-}
-const CpuRep* rep_:cpu相关信息
-```
-
-#### CpuMap
-explain:TODO一个代表指导cpu是否被设置的位图
-
-#### CpuList
-explain:继承CpuMap
-
-
-#### Topology.h
-explain:代表机器的cpu拓扑信息(我觉得应该是代表整个机器的,而不是一个enclave的,也就是说整个机器的cpu信息都在这里面)
-
-```
-const uint32_t num_cpus_:cpu个数
-CpuList all_cpus_:cpu位图
-std::vector<Cpu::CpuRep> cpus_:所有cpu信息
-int highest_node_idx_:numa节点个数
-std::vector<CpuList> cpus_on_node_:各个numa节点的cpu
-```
-
-
-
-
-
-
-
-
-## Message
-
-#### Message
-
-explain:消息队列中的消息,被存放在消息队列中等待agent或者kernel去消费
-
-```
-struct ghost_msg {
-	uint16_t type;		/* message type */
-	uint16_t length;	/* length of this message including payload */
-	uint32_t seqnum;	/* sequence number for this msg source */
-	uint32_t payload[0];	/* variable length payload */
-};
-
-ghost_msg* msg_
-```
-
-
-## Channel
-
-#### Channel
-
-explain:消息队列,存放消息,基于共享内存,和Cpu关系 TODO
-
-```
-Peek():获取队首
-Consume(const Message& msg):弹出队首
-max_elements():环形队列大小
-AssociateTask(Gtid gtid, int barrier, int* status)TODO底层调用ghost的api
-SetEnclaveDefault():将当前channe设置为enclave的默认channelTODO
-```
-
-#### LocalChannel
-
-explain:继承Channel
-
-```
-LocalChannel(int elems, int node, CpuList cpulist):底层调用ghost的api
-struct ghost_queue_header {
-	uint32_t version;	/* ABI version */
-	uint32_t start;		/* offset from the header to start of ring */
-	uint32_t nelems;	/* power-of-2 size of ghost_ring.msgs[] */
-} 
-int fd_:消息队列的fd
-ghost_queue_header* header_:队头
-```
-
-
-## StatusWord
-
-#### StatusWord
-
-explain:和内核通信的共享内存,存储Tseq,Aseq等信息
-
-```
-typedef uint32_t BarrierToken
-
-struct ghost_sw_info {
-	uint32_t id;		/* status_word region id */
-	uint32_t index;		/* index into the status_word array */
-};
-
-struct ghost_status_word {
-	uint32_t barrier;
-	uint32_t flags;
-	uint64_t gtid;
-	int64_t switch_time;	/* time at which task was context-switched onto CPU */
-	uint64_t runtime;	/* total time spent on the CPU in nsecs */
-}
-
-ghost_sw_info sw_info_:sw的id和index
-ghost_status_word* sw_:sw信息
-```
-
-#### LocalStatusWord
-
-explain:上面的继承
-
-#### StatusWordTable
-
-explain:存储statusword的一块内存区域(ps:这里我觉得有必要将statuswordtable和channel来一个对比,它们都是和内核共享数据结构)
-
-```
-size_t map_size_ = 0;
-ghost_sw_region_header* header_ = nullptr;
-ghost_status_word* table_ = nullptr;
-```
-
-#### LocalStatusWordTable
-
-
-
-
-
-
-
-## Ghost
-
-#### Ghost
-
-explain:ghost内核相关接口的封装
-
-```
-原系统调用:
-Run(const Gtid& gtid, BarrierToken agent_barrier, BarrierToken task_barrier, const Cpu& cpu, int flags):LocalYieldRunRequest和PingRunRequest,这两者是干啥呢
-SyncCommit(cpu_set_t& cpuset):SubmitSyncRequests
-Commit(cpu_set_t& cpuset):SubmitRunRequests
-CreateQueue(int elems, int node, int flags, uint64_t& mapsize):LocalChannel的构造方法
-ConfigQueueWakeup(int queue_fd, const CpuList& cpulist, int flags):LocalChannel的构造方法
-AssociateQueue(int queue_fd, ghost_type type, uint64_t arg, BarrierToken barrier, int flags):LocalChannel的AssociateTask
-SetDefaultQueue(int queue_fd):SetEnclaveDefault
-GetStatusWordInfo(ghost_type type, uint64_t arg, ghost_sw_info& info):LocalStatusWord(StatusWord::AgentSW)
-
-SchedGetAffinity(const Gtid& gtid, CpuList& cpulist):cfs
-SchedSetAffinity(const Gtid& gtid, const CpuList& cpulist):rocksdb?
-SchedTaskEnterGhost(int64_t pid, int dir_fd)
-SchedAgentEnterGhost(int ctl_fd, const Cpu& cpu, int queue_fd)  :Makes calling thread the ghost agent on `cpu`.
-```
-
-#### GhostSignals 
-
-explain:ghost线程相关信号处理(不怎么涉及?先不管)
-
-#### GhostThread
-
-explain:原生线程的封装,可以决定被cfs还是ghost调度。和enclave的关系犹如目录与文件
-
-```
-int tid_;
-Gtid gtid_;
-KernelScheduler ksched_:ghost还是cfs
-Notification started_:线程开始运行,则这个将被唤醒
-std::thread thread_:线程
-```
-
-
-
-
-
diff --git a/record/dzh/ghost_paper_record.md b/record/dzh/ghost_paper_record.md
deleted file mode 100644
index 3ec71ad706672f9c980825f628c38b411d38ae17..0000000000000000000000000000000000000000
--- a/record/dzh/ghost_paper_record.md
+++ /dev/null
@@ -1,358 +0,0 @@
-# ghost论文阅读笔记
-
-
-## 概要
-
-+ 现如今争对使用场景对内核调度策略进行修改,在性能方面可以得到很大提升
-
-+ 但是为单一使用场景定制特定调度策略的内核是不切实际的,而且还涉及到重启内核,这会导致性能,可用性大大降低
-
-+ ghost是一个这样的用户态调度框架,通过用户态agent和内核通信,能够实时灵活定制用户想要的复杂调度策略而不需要重启内核,并且适应性广泛,无论是percpu还是centralized
-
-+ 使用ghost能够增大吞吐量,减少延迟,同时为数据中心工作负载启用策略优化、非中断升级和故障隔离。
-
-
-## 1.介绍
-
-+ 许多特定场景的调度策略:
-
-    - Shinjuku request scheduler
-    - Tableau scheduler
-    - Caladan scheduler
-
-+ 在大型工程中部署特定调度策略难度极大,很可能造成内核崩溃,就算部署成功,对内核升级也需要停机
-
-+ 以前的用户态调度框架设计有明显缺点:对应用部署需要修改;需要专门的资源时期可以高响应;需要针对应用来特定修改内核
-
-+ 硬件环境变化
-
-+ The goal of ghOSt is to fundamentallychange how scheduling policies are designed, implemented, and deployed. ghOSt provides the agility of userspace development and ease of deployment, while still enabling 𝜇s-scale scheduling
-
-+ agent是一个os进程,通过相关系统调用与内核通信
-
-+ 内核通过异步消息队列告诉agent它管理的线程状态变化
-
-+ agent通过内核传递的消息同步地告诉内核调度策略的转变
-
-+ ghost支持并发执行多个调度策略
-
-+ ghost相关通信和调度时长很可观
-
-
-### 2.背景与设计目标
-
-##### 背景
-
-+ linux目前采用cfs调度策略,很难针对特定场景进行针对性优化
-
-+ 实现内核调度策略很难
-
-+ 部署内核调度策略更难
-
-+ 用户态线程的调度策略是不够的,归根结底它还是受制于内核调度策略
-
-+ 为特定场景定制内核也是不切实际
-
-+ 通过ebpf去定制调度策略?也不是很适合
-    - ebfp受到诸多限制,如栈大小,循环次数,访问内核数据受限
-    - ebpf是同步的,在调度前需要阻塞
-
-##### 设计目标
-
-+ 容易实现和测试
-
-+ 效率高,易表达
-
-+ 不局限于per-CPU模型
-
-+ 支持多种并发策略
-
-+ 非中断更新(不需要重启)和错误隔离
-
-
-### 3.设计与实现
-
-##### 基本理念
-
-+ ghost概述
-
-    - 用户态agent通知内核如何进行调度
-
-    - 内核实现通过用户态信息实现一个类似于cfs的调度类 sheduling class
-
-    - 调度类提供用户态一组接口让用户态去定制调度策略
-
-    - 为了帮助用户态判断,内核将管理线程的状态通过消息和状态码传递给agent
-
-    - 而agent通过系统调用syscall和事务transaction通知内核调度策略
-
-+ percpu和centralized概念
-
-    - percpu:调度只管本cpu的调度,有steal策略
-
-    - centralized:全局调度
-
-+ cpu与线程的概念
-
-    - 线程:内核线程
-
-    - cpu:执行单元
-
-+ enclaves
-
-    - 支持在单机上执行多种调度策略
-
-    - 因地制宜分配cpu(如NUMA架构)
-
-+ ghost使用agent
-
-    - 用户态agent实现方便,调试简单
-
-    - 配置调度策略无需重启系统
-
-    - 对于percpu,都有一个agent对应,可以对每个cpu配置不同调度策略
-
-    - 对于centralized,全局agent对所有cpu调度负责,同时还有其他不活动的agent
-
-    - 所有agent通过内核线程的模式实现,他们同属于一个进程
-
-
-##### 内核到代理的通信
-
-+ 将所线程状态传递给agent
-
-    - 共享内存?
-
-    - 系统内存文件/proc/pid?
-
-    - API(消息队列)yes
-
-+ message消息
-
-    - THREAD_CREATED
-    - THREAD_BLOCKED
-    - THREAD_PREEMPTED
-    - THREAD_YIELD
-    - THREAD_DEAD
-    - THREAD_WAKEUP
-    - THREAD_AFFINITY(线程绑核)
-    - TIMER_TICK(确保agent基于最新状态做决定)
-
-+ mq消息队列
-
-    - 组织方式:共享内存中使用自定义队列
-
-    - percpu每个cpu和agent间有一个初始mq
-
-    - centralized所有cpu和全局agent间有一个初始mq
-
-
-
-+ 线程和mq间组织方式 
-
-    - CREATE/DESTROY_QUEUE:创建/摧毁mq
-
-    - ASSOCIATE_QUEUE:修改线程msg和mq之间的发送关系
-
-+ mq和agent间组织方式
-
-    - CONFIG_QUEUE_WAKEUP:自定义msg到来时,对agent的唤醒后的行为(centralized没有配置,因为全局agent不能被阻塞)
-
-+ 在mq/cpu间移动线程
-
-    - ASSOCIATE_QUEUE:修改线程msg和mq之间的发送关系,失败场景:试图移动的线程还有msg在当前mq没有处理
-
-+ 在agents和kernel间同步
-
-    - 在agent做调度策略决定的时候,可能又有新的msg到来(后面讲述如何解决)
-
-    - Aseq和Tseq的递增条件
-
-+ 通过共享内存传递seq修改信息
-
-
-##### 代理到内核的通信
-
-+ agent通过transaction事务来和内核通信
-
-    - percpu:一个系统调用接口足矣,centralized:核如果很多,那么使用系统调用性能将下降,共享内存更合适,所以,最终采用共享内存方案
-
-    - TXN_CREATE
-
-    - TXNS_COMMIT:对于percpu,发生context swtich,意味着当前agent被替换为要运行的线程
-
-+ Group commits(批量提交)
-
-    - 对于centralized调度,单个提交会导致性能大大下降
-
-+ seq核事务
-
-    - 在agent做调度策略决定的时候,可能又有新的msg到来(后面讲述如何解决),并且该msg可能来自高优先级线程,当前agent处于running,无法唤醒通知
-
-    - 1)Read Aseq
-
-    - 2)读取msq
-
-    - 3)决定调度策略
-
-    - 4)commit,若commit的最新Aseq比内核观测到的最新Aseq小,那么commit失败
-
-+ 通过ebpf加速
-
-    - cpu空闲,但是agent没有调度线程时,ebpf会选择线程运行
-
-##### centralized 调度
-
-+ 避免全局agent线程被抢占
-
-    - 全局agent优先级最高,无论ghost还是非ghost,没有任何线程能抢占
-
-    - 造成负面影响:每个线程存在绑定的工作线程
-
-    - 通过切换到inactive 的agent解决
-
-+ sql和centralized调度
-
-    - 判断Tseq是否为最新
-
-
-##### 故障隔离与动态升级
-
-+ 和内核其他调度策略的关系
-
-    - 优先级低于内核原生调度类,如cfs
-
-+ 动态更新与回滚
-
-    - 替换agent,保留enclave:新旧agent
-
-    - 摧毁enclave,从头开始:摧毁当前enclave下所有agent,相关线程送回内核默认调度
-
-
-+ 看门狗
-
-    - 摧毁不进行线程调度的enclave
-
-
-### 4.评估和对比
-
-三个问题:
-
-+ ghost相比于其他调度器有啥额外开销
-
-+ 和之前的调度器相比
-
-+ ghost是解决大规模低延迟工作负载,比如 Google Snap, Google Search和virtual machines的可行方案吗
-
-##### ghost的开销
-
-+ 代码量:少,而且高级语言通过调库使得代码量更少
-
-+ 消息传递开销
-
-+ 本地调度开销(percpu)
-
-+ 远程调度开销(centralized)
-    每秒每个cpu可以25200个线程(100个cpu),线程40us,能让所有cpu繁忙。
-    随着agent个数增多,这个数据也是线性增长
-
-+ 全局agent性能分析:全局agent调度其他线程,统计cpu数量和commit个数的关系
-    - 第一次drop:工作线程和全局代理竞争
-    - 第二次drop:核间通信开销大于核增加开销
-
-##### 和其他定制的centralized调度类比较
-
-+ 配置
-
-    - 初始Shinjuku:调度线程绑物理核,20个工作线程绑逻辑核(别的线程不能在此核上运行)
-    - 基于ghost:710行代码,应该全局agent线程,200个工作线程(都不绑核,所以允许空闲cpu调度其他线程)
-    - 非抢占基于cfs的Shinjuku
-
-    
-+ 单一调度:只调度rocksdb
-    - ghost比初始代码量少82%
-    - 延迟ghost稍微高
-    - 饱和吞吐量相差不到5%
-    - 原因:对于每次请求,ghost都得调度一次;初始只要在20个pin住的工作线程上传递调度请求即可
-
-    - cfs由于不能抢占,太费
-
-+ 多种调度:
-    - 初始由于不能控制其他cpu线程,当负载低的时候也不能将cpu放出
-    - 试Shenango?不适合执行时间时刻变化的场景,导致延迟相比于Shinjuku更久
-    - 吞吐量比较和单一调度差不多,但是cpushare,ghost可以将空闲cpu分享给其他线程
-
-
-个人总结:延迟ghost略逊,但是对于将cpu分享给其他线程这块,ghost做的更好,Shinjuku不会分享空闲cpu
-
-##### Google Snap
-
-+ 实时工作线程如何调度?
-    MicroQuanta保证在任何一个时间周期内(例如1ms)将cpu分享给指定工作线程:兼顾实时调度工作线程和不让其他线程挨饿(个人解读:寻找一个恰好合适的周期,能够兼顾二者)
-    
-
-+ 测试环境
-    - 6对客户-服务器模型线程,1个64b,另外5个64kb
-    - 两种实验测试:quiet只有工作线程;loaded还有其他40竞争线程
-
-+ ghost配置
-    - ghost的部署:全局agent线程管理工作线程和其他竞争线程,snap工作线程具有高优先级
-    - ghost管理的线程会被cfs调度的客户-服务器模型线程和原生线程抢占
-    - 竞争线程只会在空闲时被调度
-
-+ 延迟比较:
-    - quiet:64b ghost要好,64kb初始要好   64kb拷贝时间过多导致调度事件过少
-    - loaded: 同上,但是64kb ghost的差距没64b大
-
-个人总结:ghost能够灵活调度线程去空闲cpu,一个特别简单的ghost配置就能接近大量修改内核的MicroQuanta调度策略
-
-##### Google Search
-
-+ 测试环境
-
-    - cpu和内存 敏感
-    - SSD和内存敏感
-    - cpu敏感
-
-+ ghost配置
-
-    - 全局agent调度256个cpu,NUMA和CCX情况需要考虑,前者有自己内存,后者有自己L3cache
-
-+ QPS和延迟比较
-
-    - QPS二者差不多,二者都能针对NUMA和CCX情况优化,但是CFS需要重启
-
-    - 在延迟上,ghost明显好于cfs,除了第三种。CFS以毫秒级平衡线程调度,ghost则会在微秒级做出反应。对于第三种,也能通过重新制定调度策略优化
-
-+ 快速进行实验的经验
-
-    - 对内核修改后重新编译花费很长时间,每天只能干5次
-
-    - ghost不需修改重启内核,只需要1分钟
-
-个人总结:ghost低延迟,特定场景对空闲cpu的掌控能力使得其延迟很凶残
-
-##### 保护虚拟机,防止L1TF/MDS攻击
-
-ps:这两个攻击我都不懂,乐,大概是可以从运行在同一个超线程的另一个虚拟机中窃取数据。解决办法是确保每个虚拟机的每个虚拟cpu只运行在特定的物理核上。 Microarchitectural buffers在切换其他虚拟机的时候需要被清空
-
-+ per-core 调度:让一个核上运行相同虚拟机的cpu
-
-+ 性能,差不多,但是后两个要安全
-
-个人总结:针对攻击能够制定特定调度策避免被攻击
-
-### 5.未来工作
-
-+ 使用ebpf加速
-
-+ 关闭时间中断
-
-### 6.相关工作
-
-没啥好看的
-
-### 7.结论
-
-没啥好说的
-
diff --git a/record/dzh/google.md b/record/dzh/google.md
deleted file mode 100644
index 7bae08bfa1bf3669e8959cbb567c468cb5514345..0000000000000000000000000000000000000000
--- a/record/dzh/google.md
+++ /dev/null
@@ -1,77 +0,0 @@
-
-
-毫无疑问,有人会尝试将BPF引入内核的CPU调度器,这只是时间问题。在1月底,Tejun Heo与David Vernet、Josh Don和Barret Rhoden合作发布了30个补丁系列的第二版,旨在实现这一目标。将调度决
-策延迟到BPF程序中可能会有一些有趣的事情,但要让整个开发社区接受这个想法可能需要一些工作。
-
-BPF的核心思想是允许程序在运行时从用户空间加载到内核中;使用BPF进行调度具有潜力使得调度行为与目前在Linux系统中看到的有很大不同。“可插拔”的调度器概念并不是新鲜的;例如,在2004年的一
-次讨论中,Con Kolivas提出了一系列注定失败的补丁,其中涉及到可插拔的调度器。当时,这个可插拔调度器的想法受到了强烈的反对;因为只有将精力集中在单个调度器上,开发社区才能找到一种方
-式,满足所有工作负载,而不会将内核填满各种特殊目的的调度器的混乱。
-
-当然,内核只有一个CPU调度器的想法并不完全准确;实际上,还有几个调度器可供应用程序选择,包括实时调度器和截止时间调度器。但是,在Linux系统上几乎所有的工作都在默认的“完全公平调度器”
-下运行,它确实在各种从嵌入式系统到超级计算机的工作负载管理方面都做得很好。人们总是渴望更好的性能,但多年来几乎没有要求提供可插拔调度器机制的请求。
-
-那么,为什么现在提出BPF机制呢?为了避免长时间的讨论,这个补丁系列的说明信详细描述了这项工作的动机。简而言之,这个论点是,使用BPF编写调度策略极大地降低了尝试新的调度方法的难度。自完
-全公平调度器问世以来,我们的工作负载和运行它们的系统变得更加复杂;需要进行实验来开发适合当前系统的调度算法。BPF调度类可以以安全的方式进行实验,甚至无需重新启动测试机器。使用BPF编写
-的调度器还可以提高针对某些特定工作负载的性能,这些工作负载可能不值得在主线内核中支持,并且部署到大型系统集群中也更加容易。
-
-## Scheduling with BPF
-
-这个补丁集添加了一个名为SCHED_EXT的新调度类,可以通过类似于大多数其他调用sched_setscheduler()的调用来选择它(选择SCHED_DEADLINE有点更加复杂)。它是一个非特权类,这意味着任何进程
-都可以将自己置于SCHED_EXT中。SCHED_EXT被放置在优先级堆栈中的空闲类(SCHED_IDLE)和完全公平调度器(SCHED_NORMAL)之间。因此,没有SCHED_EXT调度器可以以一种阻止例如以SCHED_NORMAL
-运行的普通shell会话运行的方式接管系统。它还建议,在使用SCHED_EXT的系统上,期望大部分工作负载将在该类中运行。
-
-BPF编写的调度程序对整个系统是全局的;没有为不同的进程组加载自己的调度程序的规定。如果没有加载BPF调度程序,则放置在SCHED_EXT类中的任何进程将像在SCHED_NORMAL中一样运行。然而,一旦
-加载了BPF调度程序,它将接管所有SCHED_EXT任务的责任。还有一个神奇的函数,BPF调度程序可以调用(scx_bpf_switch_all()),它将所有运行在实时优先级以下的进程移动到SCHED_EXT中。
-
-实现调度程序的BPF程序通常会管理一组调度队列,每个队列都可能包含等待在CPU上执行的可运行任务。默认情况下,系统中每个CPU都有一个调度队列和一个全局队列。当CPU准备好运行新任务时,调度
-程序将从相应的调度队列中取出一个任务并将其分配给CPU。调度程序的BPF部分大多实现为一组通过操作结构调用的回调函数,每个回调函数通知BPF代码需要进行的事件或决策。该列表很长,完整的列表
-可以在SCHED_EXT存储库分支的include/sched/ext.h中找到。该列表包括:
-
-        当一个新的任务进入SCHED_EXT时,prep_enable()和enable()这两个回调函数将通知调度程序。prep_enable()可以用于为该任务设置任何相关数据,它可以阻塞并用于内存分配。enable()则
-        无法阻塞,它实际上启用了新任务的调度。
-
-        select_cpu()回调函数用于为刚刚唤醒的任务选择一个CPU,并返回要将任务放置在的CPU编号。这个决策可以在任务实际运行之前重新审视,但它可能被调度程序用于唤醒选择的CPU(如果它当
-        前处于空闲状态)。
-
-        enqueue()回调函数将一个任务加入调度程序以进行运行。通常,该回调将调用scx_bpf_dispatch()将任务放置到选择的调度队列中,该队列最终将在任务运行时为其提供时间片长度。如果将片
-        长指定为SCX_SLICE_INF,则在此任务运行时,CPU将进入无节拍模式。
-
-        值得注意的是,enqueue()不必将任务放入任何调度队列;如果任务不应立即运行,它可以将任务暂时放在某个地方。但内核会跟踪这些任务,以确保没有任务被遗忘;如果任务滞留时间过长(默
-        认为30秒,但超时时间可以缩短),BPF调度程序最终将被卸载。
-
-        当一个CPU的调度队列为空时,调用dispatch()回调函数将任务分派到该队列中以保持CPU忙碌。如果调度队列仍然为空,调度程序将尝试从全局队列中获取任务。
-
-        update_idle()回调函数将通知调度程序一个CPU何时进入或离开空闲状态。
-
-        runnable()、running()、stopping()和quiescent()回调函数分别通知调度程序任务的状态更改。它们分别在任务变为可运行、在CPU上开始运行、从CPU上被取下或变为不可运行时调用。
-
-        cpu_acquire()和cpu_release()回调函数通知调度程序系统中CPU的状态。当一个CPU对BPF调度程序可用时,回调函数cpu_acquire()将通知它这个事实。当一个CPU不可用时(例如,一个实时
-        调度类可能已经使用它),将通过调用cpu_release()来通知它。
-
-
-还有许多其他的回调函数用于控制组的管理、CPU亲和性、核心调度等。此外,还有一组函数可供调度程序调用以影响调度决策;例如,scx_bpf_kick_cpu() 可用于抢占正在给定CPU上运行的任务,并回
-调调度程序以选择在该CPU上运行的新任务。
-
-## Examples
-
-最终的结果是一个框架,允许在 BPF 代码中实现各种调度策略。为了证明这一点,这个补丁系列包含了许多示例调度器。其中一部分是一个最小的“虚拟”调度器,它使用默认的回调函数;另一个则是一个
-基本调度器,实现了五个优先级级别,并展示了如何将任务存储到 BPF 映射中。“虽然不是很实用,但它作为一个简单的示例很有用,并将用于演示不同的功能”。
-
-此外,还有一个“中央”调度程序,它将一个CPU专用于调度决策,使得所有其他CPU都可以自由运行工作负载。后续的补丁为该调度程序添加了tickless支持,并总结道:
-
-        尽管 scx_example_central本身不足以用作生产调度程序,但可以使用相同的方法构建更具特色的中央调度程序。Google 的经验表明,这种方法对某些应用程序(如 VM 托管)具有重要的好处
-
-此外,scx_example_pair 采用控制组实现了一种核心调度形式。scx_example_userland 调度程序“在用户空间实现了一个相当不成熟的排序列表 vruntime 调度程序,以演示大多数调度决策如何委托给
-用户空间”。该系列最后介绍了 Atropos 调度程序,它具有用 Rust 编写的重要的用户空间组件。信件封面还介绍了另一个调度程序 scx_example_cgfifo,因为它依赖于仍未合并到主线的 BPF rbtree 
-补丁而未被包含在该系列中。它“为各个工作负载提供 FIFO 策略,并提供扁平化分层 vtree 用于控制组”,显然在 Apache Web 服务基准测试中比 SCHED_NORMAL 提供更好的性能。
-
-## Prospects
-
-这个补丁集目前已经发布了第二个版本,并且迄今为止还没有引起很多评论,也许太大了,无法进行辩论。然而,调度器维护者Peter Zijlstra在第一个版本中回应说:“我讨厌所有这些。Linus在过去多
-次否决了可加载的调度器,这只是又一次——加上了整个BPF问题的额外缺陷。”然而,他继续审查了许多组成补丁,这表明他可能不打算完全拒绝这项工作。
-
-BPF调度器类显然是核心内核社区难以接受的重要改动。它增加了超过10,000行的核心代码,并公开了许多迄今为止被深深隐藏在内核中的调度细节。这将承认一个通用调度器无法最优地服务于所有工作负
-载。一些人可能担心这将标志着完全公平调度器的工作结束,并增加Linux系统的碎片化。BPF调度器的开发人员则持相反的观点,认为能够自由实验调度模型,将加速完全公平调度器的改进。
-
-这个子系统的最终结果如何还很难预测,但可以指出的是,迄今为止,BPF巨头已经成功地克服了几乎所有遇到的反对意见。在内核中锁定核心功能的时代似乎正在结束。看到这个子系统将会开启哪些新的
-调度方法将会是很有趣的。
diff --git a/record/dzh/huawe.md b/record/dzh/huawe.md
deleted file mode 100644
index c1c847d36a95dd0c785ae757526dd92c90c62615..0000000000000000000000000000000000000000
--- a/record/dzh/huawe.md
+++ /dev/null
@@ -1,22 +0,0 @@
-你好Roman和列表成员,
-
-我们希望实现一个可编程的调度器,以满足不同工作负载的调度需求。
-
-使用BPF,我们可以轻松地为特定工作负载部署调度策略,快速验证,无需修改内核代码。这大大降低了在生产环境中部署新调度策略的成本。
-
-因此,我们希望在您的补丁的基础上继续开发。我们计划将其合并到openeuler开源社区中,并利用社区不断演进和维护它。
-(链接:https://www.openeuler.org/en/)
-
-我们对您的补丁进行了一些更改:
-1.适应openeuler-OLK-5.10分支,该分支大多基于长期支持的Linux分支5.10。
-2.引入Kconfig CONFIG_BPF_SCHED以在编译时隔离相关代码。
-3.修改了helpers bpf_sched_entity_to_cgrpid()和bpf_sched_entity_belongs_to_cgrp(),通过se->my_q->tg->css.cgroup获取调度实体所属的任务组。
-
-我们有一些关于下一次Scheduler BPF迭代的想法,想与您分享:
-1.在struct task_struct和struct task_group中添加tag字段。用户可以使用文件系统接口为特定工作负载标记不同的标签。bpf prog获取标签以检测不同的工作负载。
-2.添加BPF hook和helper来调度进程,如select_task_rq和pick_next_task,以实现可扩展性。
-
-这是一个新的尝试,后面肯定会有很多问题,但让调度器可编程是令人兴奋的。
-
-祝好,
-任志杰
diff --git a/sched/Makefile b/sched/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..fa9430ec0851787f5e645b650a0ac04d21dff2f1
--- /dev/null
+++ b/sched/Makefile
@@ -0,0 +1,200 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+include ../../build/Build.include
+include ../../scripts/Makefile.arch
+include ../../scripts/Makefile.include
+
+ifneq ($(LLVM),)
+ifneq ($(filter %/,$(LLVM)),)
+LLVM_PREFIX := $(LLVM)
+else ifneq ($(filter -%,$(LLVM)),)
+LLVM_SUFFIX := $(LLVM)
+endif
+
+CLANG_TARGET_FLAGS_arm          := arm-linux-gnueabi
+CLANG_TARGET_FLAGS_arm64        := aarch64-linux-gnu
+CLANG_TARGET_FLAGS_hexagon      := hexagon-linux-musl
+CLANG_TARGET_FLAGS_m68k         := m68k-linux-gnu
+CLANG_TARGET_FLAGS_mips         := mipsel-linux-gnu
+CLANG_TARGET_FLAGS_powerpc      := powerpc64le-linux-gnu
+CLANG_TARGET_FLAGS_riscv        := riscv64-linux-gnu
+CLANG_TARGET_FLAGS_s390         := s390x-linux-gnu
+CLANG_TARGET_FLAGS_x86          := x86_64-linux-gnu
+CLANG_TARGET_FLAGS              := $(CLANG_TARGET_FLAGS_$(ARCH))
+
+ifeq ($(CROSS_COMPILE),)
+ifeq ($(CLANG_TARGET_FLAGS),)
+$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk)
+else
+CLANG_FLAGS     += --target=$(CLANG_TARGET_FLAGS)
+endif # CLANG_TARGET_FLAGS
+else
+CLANG_FLAGS     += --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif # CROSS_COMPILE
+
+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as
+else
+CC := $(CROSS_COMPILE)g++
+endif # LLVM
+
+CURDIR := $(abspath .)
+TOOLSDIR := $(abspath ../..)
+LIBDIR := $(TOOLSDIR)/lib
+COS_LIBDIR := $(CURDIR)/../lib
+BPFDIR := $(LIBDIR)/bpf
+TOOLSINCDIR := $(TOOLSDIR)/include
+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
+APIDIR := $(TOOLSINCDIR)/uapi
+GENDIR := $(abspath ../../../include/generated)
+GENHDR := $(GENDIR)/autoconf.h
+LLVM_INCLUDE ?= /home/shootfirst/llvm-project/build/lib/clang/17/include/
+
+SCRATCH_DIR := $(CURDIR)/tools
+BUILD_DIR := $(SCRATCH_DIR)/build
+INCLUDE_DIR := $(SCRATCH_DIR)/include
+BPFOBJ_DIR := $(BUILD_DIR)/libbpf
+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a
+ifneq ($(CROSS_COMPILE),)
+HOST_BUILD_DIR		:= $(BUILD_DIR)/host
+HOST_SCRATCH_DIR	:= host-tools
+HOST_INCLUDE_DIR	:= $(HOST_SCRATCH_DIR)/include
+else
+HOST_BUILD_DIR		:= $(BUILD_DIR)
+HOST_SCRATCH_DIR	:= $(SCRATCH_DIR)
+HOST_INCLUDE_DIR	:= $(INCLUDE_DIR)
+endif
+HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a
+RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids
+DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool
+
+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux)					\
+		     $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux)		\
+		     ../../../vmlinux						\
+		     /sys/kernel/btf/vmlinux					\
+		     /boot/vmlinux-$(shell uname -r)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+ifeq ($(VMLINUX_BTF),)
+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
+endif
+
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+
+ifneq ($(wildcard $(GENHDR)),)
+  GENFLAGS := -DHAVE_GENHDR
+endif
+
+CFLAGS += -g -O2 -rdynamic -pthread -std=c++11 -Wall -Werror $(GENFLAGS)			\
+	  -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR)			\
+	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(COS_LIBDIR)
+
+CARGOFLAGS := --release
+
+# Silence some warnings when compiled with clang
+ifneq ($(LLVM),)
+CFLAGS += -Wno-unused-command-line-argument
+endif
+
+LDFLAGS = -lelf -lz -lpthread
+
+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null |				\
+			grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '-target bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
+endef
+
+BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH)					\
+	     $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian)		\
+	     -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) -I$(LLVM_INCLUDE)	-I$(COS_LIBDIR)			\
+	     -I../../../include							\
+	     $(call get_sys_includes,$(CLANG))					\
+	     -Wall -Wno-compare-distinct-pointer-types				\
+	     -O2 -mcpu=v3
+
+all: scx_sjf scx_mfq
+
+# sort removes libbpf duplicates when not cross-building
+MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf		\
+	       $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids	\
+	       $(INCLUDE_DIR))
+
+$(MAKE_DIRS):
+	$(call msg,MKDIR,,$@)
+	$(Q)mkdir -p $@
+
+$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile)			\
+	   $(APIDIR)/linux/bpf.h						\
+	   | $(BUILD_DIR)/libbpf
+	$(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/	\
+		    EXTRA_CFLAGS='-g -O0 -fPIC'					\
+		    DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
+
+$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile)	\
+		    $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool
+	$(Q)$(MAKE) $(submake_extras)  -C $(BPFTOOLDIR)				\
+		    ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD)		\
+		    EXTRA_CFLAGS='-g -O0'					\
+		    OUTPUT=$(HOST_BUILD_DIR)/bpftool/				\
+		    LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/			\
+		    LIBBPF_DESTDIR=$(HOST_SCRATCH_DIR)/				\
+		    prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install-bin
+
+$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR)
+ifeq ($(VMLINUX_H),)
+	$(call msg,GEN,,$@)
+	$(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
+else
+	$(call msg,CP,,$@)
+	$(Q)cp "$(VMLINUX_H)" $@
+endif
+
+%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h user_exit_info.h	\
+	| $(BPFOBJ)
+	$(call msg,CLNG-BPF,,$@)
+	$(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@
+
+%.skel.h: %.bpf.o $(BPFTOOL)
+	$(call msg,GEN-SKEL,,$@)
+	$(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
+	$(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o)
+	$(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o)
+	$(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o)
+	$(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(notdir $(<:.bpf.o=)) > $@
+	$(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(notdir $(<:.bpf.o=)) > $(@:.skel.h=.subskel.h)
+
+scx_sjf: $(CURDIR)/sjf/scx_sjf.c $(CURDIR)/sjf/scx_sjf.skel.h	\
+		      $(CURDIR)/sjf/scx_sjf_common.h user_exit_info.h
+	$(CC) $(CFLAGS) -c $< -o $@.o
+	$(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS)
+
+scx_mfq: $(CURDIR)/mfq/scx_mfq.c $(CURDIR)/mfq/scx_mfq.skel.h	\
+		      $(CURDIR)/mfq/scx_mfq_common.h user_exit_info.h
+	$(CC) $(CFLAGS) -c $< -o $@.o
+	$(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS)
+
+
+clean:
+	rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)
+	rm -f sjf/*.o sjf/*.bpf.o sjf/*.skel.h sjf/*.subskel.h *.o
+	rm -f scx_sjf
+	rm -f mfq/*.o mfq/*.bpf.o mfq/*.skel.h mfq/*.subskel.h *.o
+	rm -f scx_mfq
+
+install:
+	sudo mkdir -p /etc/cos/shm
+
+.PHONY: all clean install
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.skel.h, .bpf.o, etc) targets
+.SECONDARY:
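
The %.skel.h rule above generates scx_mfq.skel.h (and scx_sjf.skel.h) with bpftool, and the final link pulls in libbpf. Below is a rough sketch of the skeleton lifecycle those generated headers provide; the open/load/destroy helpers and the userland_ops struct_ops map name follow scx_mfq.bpf.c, while run_skeleton_sketch itself is hypothetical and is not the main() in scx_mfq.c.

/* Sketch only: typical libbpf skeleton lifecycle for the scx_mfq scheduler. */
#include <unistd.h>
#include <bpf/libbpf.h>
#include "scx_mfq.skel.h"

static int run_skeleton_sketch(void)
{
	struct scx_mfq *skel;
	struct bpf_link *link;

	skel = scx_mfq__open();                  /* parse and prepare the BPF object */
	if (!skel)
		return -1;

	/* const volatile globals live in .rodata and must be set before load */
	skel->rodata->usersched_pid = getpid();

	if (scx_mfq__load(skel)) {               /* verify and load programs and maps */
		scx_mfq__destroy(skel);
		return -1;
	}

	/* registering the struct_ops map hands SCHED_EXT tasks over to this program */
	link = bpf_map__attach_struct_ops(skel->maps.userland_ops);
	if (!link) {
		scx_mfq__destroy(skel);
		return -1;
	}

	/* ... drain the 'enqueued' map and feed the 'dispatched' map, as scx_mfq.c does ... */

	bpf_link__destroy(link);
	scx_mfq__destroy(skel);
	return 0;
}
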
diff --git a/sched/mfq/mfq_sched.h b/sched/mfq/mfq_sched.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bccac920c23a91d52deac9d690e7af0b2ef228d
--- /dev/null
+++ b/sched/mfq/mfq_sched.h
@@ -0,0 +1,167 @@
+#include <vector>
+#include <queue>
+#include <cassert>
+#include <iostream>
+#include <chrono>
+
+#define MS2NS (1000 * 1000)  /* one millisecond expressed in nanoseconds */
+
+
+/* The data structure containing tasks that are enqueued in user space. */
+struct enqueued_task {
+	__u64 sum_exec_runtime;
+	__s32 queue_id; 
+};
+
+// Returns the current wall-clock time in nanoseconds since the epoch.
+std::time_t getTimeStamp()
+{
+    auto tp = std::chrono::time_point_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now());
+    return tp.time_since_epoch().count();
+}
+
+
+class MFQSched {
+private:
+	// number of queues
+	static const size_t queue_num_ = 5;
+
+	// capacity of each queue
+	const std::vector<size_t> queue_sizes_ = {1000, 2000, 4000, 6000, 10000};
+
+	// per-queue limit on a task's accumulated runtime; the last queue is unbounded
+	const std::vector<__u64> max_time_ = {1 * MS2NS, 3 * MS2NS, 10 * MS2NS, 20 * MS2NS, 0};
+
+	// longest a queue may go unscheduled before it is treated as starved
+	const std::vector<__u64> max_gap_ = {0, 6 * MS2NS, 20 * MS2NS, 40 * MS2NS, 100 * MS2NS};
+
+	// the queues holding enqueued tasks
+	std::vector<std::queue<enqueued_task*>> mfq_;
+
+	// timestamp of the last time each queue was scheduled
+	std::vector<__u64> latest_sched_time_;
+
+	// number of tasks to dispatch per scheduling round
+	size_t total_sched_num_;
+
+	// share of a round reserved for the normally chosen queue (the rest may go to a starved queue)
+	size_t normal_sched_num_;
+
+
+public:
+
+	MFQSched() {}
+
+	MFQSched(size_t total_sched_num) : total_sched_num_(total_sched_num) {
+		
+		normal_sched_num_ = total_sched_num - total_sched_num / 4;
+
+		__u64 now = (__u64)getTimeStamp();
+		printf("MFQ init! now is %llu\n", now);
+
+		latest_sched_time_ = std::vector<__u64>(queue_num_);
+		for (size_t i = 0; i < queue_num_; i++) {
+			latest_sched_time_[i] = now;
+		}
+
+		mfq_ = std::vector<std::queue<enqueued_task*>>(queue_num_);
+		for (size_t i = 0; i < queue_num_; i++) {
+			mfq_[i] = std::queue<enqueued_task*>();
+		}
+
+	}
+
+	bool Enqueue(enqueued_task* task) {
+		assert((size_t)task->queue_id < queue_num_);
+
+		// decide which queue to push 
+		size_t queue_id = 0;
+		for (; queue_id < queue_num_ - 1; queue_id++) {
+			// printf("task->sum_exec_runtime %llu, queue %lu max_time %llu\n", task->sum_exec_runtime, queue_id, max_time_[queue_id]);
+			if (task->sum_exec_runtime < max_time_[queue_id]) {
+				break;
+			}
+		}
+		task->queue_id = queue_id;
+
+		// target queue is full
+		if (mfq_[task->queue_id].size() == queue_sizes_[task->queue_id]) {
+			return false;
+		}
+
+		// push the thread to target queue
+		mfq_[task->queue_id].push(task);
+		
+		return true;
+	}
+
+	std::vector<enqueued_task*> Schedule() {
+
+		__u64 now = (__u64)getTimeStamp();
+
+		// 1. find queue to schedule
+		size_t chosen_queue = 0;
+		for (; chosen_queue < queue_num_; chosen_queue++) {
+			if (!mfq_[chosen_queue].empty()) {
+				break;
+			}
+		}
+
+		// 2. among the lower-priority queues, find the non-empty one that has waited longest
+		size_t max_gap_queue = queue_num_;
+		__u64 max_gap_time = 0;
+		for (auto i = chosen_queue + 1; i < queue_num_; i++) {
+			if (!mfq_[i].empty() && now - latest_sched_time_[i] > max_gap_time) {
+				max_gap_time = now - latest_sched_time_[i];
+				max_gap_queue = i;
+			}
+		}
+
+		// printf("chosen_queue is %lu, max_gap_queue is %lu\n",chosen_queue, max_gap_queue);
+
+		// 3. process by case
+		std::vector<enqueued_task*> ans(0);
+
+		// 3.1 every queue is empty
+		if (chosen_queue >= queue_num_) {
+			return ans;
+
+		// 3.2 a runnable queue exists, but no starved lower-priority queue
+		} else if (max_gap_queue >= queue_num_) {
+			// a. drain the chosen queue
+			size_t cnt = std::min(mfq_[chosen_queue].size(), total_sched_num_);
+			for (size_t i = 0; i < cnt; i++) {
+				ans.push_back(mfq_[chosen_queue].front());
+				mfq_[chosen_queue].pop();
+			}
+
+			// b. update time
+			latest_sched_time_[chosen_queue] = now;
+
+		// 3.3 both a runnable queue and a starved lower-priority queue
+		} else {
+
+			// a. chosen queue
+			size_t cnt = std::min(mfq_[chosen_queue].size(), normal_sched_num_);
+			for (size_t i = 0; i < cnt; i++) {
+				ans.push_back(mfq_[chosen_queue].front());
+				mfq_[chosen_queue].pop();
+			}
+
+			// b. starvation queue
+			size_t starvation_cnt = std::min(mfq_[max_gap_queue].size(), total_sched_num_ - cnt);
+			for (size_t i = 0; i < starvation_cnt; i++) {
+				ans.push_back(mfq_[max_gap_queue].front());
+				mfq_[max_gap_queue].pop();
+			}
+
+			// c. update time
+			latest_sched_time_[chosen_queue] = now;
+			latest_sched_time_[max_gap_queue] = now;
+		}
+
+		return ans;
+
+	}
+
+};
\ No newline at end of file
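
MFQSched can be exercised on its own, outside the BPF plumbing, which is handy for checking the queue-selection rules. A minimal sketch is below; it assumes __u64/__s32 are visible via <linux/types.h>, as they are when this header is included from scx_mfq.c.

// mfq_sched_sketch.cpp -- illustration only, not part of the patch.
#include <linux/types.h>
#include <cstdio>
#include "mfq_sched.h"

int main() {
    MFQSched mfq(8);                         // dispatch at most 8 tasks per round

    static enqueued_task tasks[3] = {};
    tasks[0].sum_exec_runtime = 0;           // brand-new task   -> queue 0
    tasks[1].sum_exec_runtime = 5 * MS2NS;   // 5 ms of runtime  -> queue 2
    tasks[2].sum_exec_runtime = 50 * MS2NS;  // long-running     -> last queue
    for (auto& t : tasks) mfq.Enqueue(&t);

    for (enqueued_task* t : mfq.Schedule())
        printf("dispatching a task from queue %d\n", t->queue_id);
    return 0;
}
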
diff --git a/sched/mfq/scx_mfq.bpf.c b/sched/mfq/scx_mfq.bpf.c
new file mode 100644
index 0000000000000000000000000000000000000000..714218dcc88f36eaa14f3c00e30d27d2a71e61cb
--- /dev/null
+++ b/sched/mfq/scx_mfq.bpf.c
@@ -0,0 +1,272 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A minimal userland scheduler.
+ *
+ * In terms of scheduling, this provides two different types of behaviors:
+ * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity.
+ *    All such tasks are direct-dispatched from the kernel, and are never
+ *    enqueued in user space.
+ * 2. A primitive multi-level feedback queue (MFQ) scheduler implemented in user
+ *    space, for all other tasks.
+ *
+ * Some parts of this example user space scheduler could be implemented more
+ * efficiently using more complex and sophisticated data structures. For
+ * example, rather than using BPF_MAP_TYPE_QUEUE's,
+ * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between
+ * user space and kernel space. Similarly, the user-space side keeps simple
+ * FIFO queues per feedback level, but more elaborate structures could be used.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <string.h>
+#include "../scx_common.bpf.h"
+#include "scx_mfq_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+const volatile bool switch_partial;
+const volatile s32 usersched_pid;
+
+/* !0 for veristat, set during init */
+const volatile u32 num_possible_cpus = 64;
+
+/* Stats that are printed by user space. */
+u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues;
+
+struct user_exit_info uei;
+
+/*
+ * Whether the user space scheduler needs to be scheduled due to a task being
+ * enqueued in user space.
+ */
+static bool usersched_needed;
+
+/*
+ * The map containing tasks that are enqueued in user space from the kernel.
+ *
+ * This map is drained by the user space scheduler.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, USERLAND_MAX_TASKS);
+	__type(value, struct scx_userland_enqueued_task);
+} enqueued SEC(".maps");
+
+/*
+ * The map containing tasks that are dispatched to the kernel from user space.
+ *
+ * Drained by the kernel in userland_dispatch().
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, USERLAND_MAX_TASKS);
+	__type(value, s32);
+} dispatched SEC(".maps");
+
+/* Per-task scheduling context */
+struct task_ctx {
+	bool force_local; /* Dispatch directly to local DSQ */
+};
+
+/* Map that contains task-local storage. */
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static bool is_usersched_task(const struct task_struct *p)
+{
+	return p->pid == usersched_pid;
+}
+
+static bool keep_in_kernel(const struct task_struct *p)
+{
+	return p->nr_cpus_allowed < num_possible_cpus;
+}
+
+static struct task_struct *usersched_task(void)
+{
+	struct task_struct *p;
+
+	p = bpf_task_from_pid(usersched_pid);
+	/*
+	 * Should never happen -- the usersched task should always be managed
+	 * by sched_ext.
+	 */
+	if (!p)
+		scx_bpf_error("Failed to find usersched task %d", usersched_pid);
+
+	return p;
+}
+
+s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	if (keep_in_kernel(p)) {
+		s32 cpu;
+		struct task_ctx *tctx;
+
+		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+		if (!tctx) {
+			scx_bpf_error("Failed to look up task-local storage for %s", p->comm);
+			return -ESRCH;
+		}
+
+		if (p->nr_cpus_allowed == 1 ||
+		    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+			tctx->force_local = true;
+			return prev_cpu;
+		}
+
+		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr);
+		if (cpu >= 0) {
+			tctx->force_local = true;
+			return cpu;
+		}
+	}
+
+	return prev_cpu;
+}
+
+static void dispatch_user_scheduler(void)
+{
+	struct task_struct *p;
+
+	usersched_needed = false;
+	p = usersched_task();
+	if (p) {
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	}
+}
+
+static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags)
+{
+	
+	struct scx_userland_enqueued_task task;
+
+	memset(&task, 0, sizeof(task));
+	task.pid = p->pid;
+	task.sum_exec_runtime = p->se.sum_exec_runtime;
+	task.weight = p->scx.weight;
+	// bpf_trace_printk("enqueue taggered! task->pid = %d, tgid = %d\n",p->pid,p->tgid);
+	if (bpf_map_push_elem(&enqueued, &task, 0)) {
+		/*
+		 * If we fail to enqueue the task in user space, put it
+		 * directly on the global DSQ.
+		 */
+		__sync_fetch_and_add(&nr_failed_enqueues, 1);
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	} else {
+		__sync_fetch_and_add(&nr_user_enqueues, 1);
+		usersched_needed = true;
+	}
+}
+
+void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	if (keep_in_kernel(p)) {
+		u64 dsq_id = SCX_DSQ_GLOBAL;
+		struct task_ctx *tctx;
+
+		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+		if (!tctx) {
+			scx_bpf_error("Failed to lookup task ctx for %s", p->comm);
+			return;
+		}
+
+		if (tctx->force_local)
+			dsq_id = SCX_DSQ_LOCAL;
+		tctx->force_local = false;
+		scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags);
+		__sync_fetch_and_add(&nr_kernel_enqueues, 1);
+		return;
+	} else if (!is_usersched_task(p)) {
+		enqueue_task_in_user_space(p, enq_flags);
+	}
+}
+/*
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
+ * scx_bpf_consume().
+*/
+void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev)
+{
+	/*
+	* Whether the user space scheduler needs to be scheduled due to a task being
+	* enqueued in user space.
+	*/
+	if (usersched_needed)
+		dispatch_user_scheduler();
+
+	bpf_repeat(4096) {
+		s32 pid;
+		struct task_struct *p;
+
+		if (bpf_map_pop_elem(&dispatched, &pid))
+			break;
+
+		/*
+		 * The task could have exited by the time we get around to
+		 * dispatching it. Treat this as a normal occurrence, and simply
+		 * move onto the next iteration.
+		 */
+		p = bpf_task_from_pid(pid);
+		if (!p)
+			continue;
+
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	}
+}
+
+s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p,
+		   struct scx_enable_args *args)
+{
+	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE))
+		return 0;
+	else
+		return -ENOMEM;
+}
+
+s32 BPF_STRUCT_OPS(userland_init)
+{
+	if (num_possible_cpus == 0) {
+		scx_bpf_error("User scheduler # CPUs uninitialized (%d)",
+			      num_possible_cpus);
+		return -EINVAL;
+	}
+
+	if (usersched_pid <= 0) {
+		scx_bpf_error("User scheduler pid uninitialized (%d)",
+			      usersched_pid);
+		return -EINVAL;
+	}
+
+	if (!switch_partial)
+		scx_bpf_switch_all();
+	return 0;
+}
+
+void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei)
+{
+	uei_record(&uei, ei);
+}
+
+SEC(".struct_ops")
+struct sched_ext_ops userland_ops = {
+	.select_cpu		= (void *)userland_select_cpu,
+	.enqueue		= (void *)userland_enqueue,
+	.dispatch		= (void *)userland_dispatch,
+	.prep_enable		= (void *)userland_prep_enable,
+	.init			= (void *)userland_init,
+	.exit			= (void *)userland_exit,
+	.timeout_ms		= 3000,
+	.name			= "userland",
+};
diff --git a/sched/mfq/scx_mfq.c b/sched/mfq/scx_mfq.c
new file mode 100644
index 0000000000000000000000000000000000000000..814182456731fdbe2fec863032de8fabc72b7d19
--- /dev/null
+++ b/sched/mfq/scx_mfq.c
@@ -0,0 +1,368 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A demo sched_ext user space scheduler which provides multi-level feedback
+ * queue (MFQ) semantics, with the queues maintained in user space.
+ *
+ * Each CPU in the system resides in a single, global domain. This precludes
+ * the need to do any load balancing between domains. The scheduler could
+ * easily be extended to support multiple domains, with load balancing
+ * happening in user space.
+ *
+ * Any task which has any CPU affinity is scheduled entirely in BPF. This
+ * program only schedules tasks which may run on any CPU.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+//#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <sched.h>
+#include <signal.h>
+#include <assert.h>
+#include <libgen.h>
+#include <pthread.h>
+#include <bpf/bpf.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <map>
+
+#include "../user_exit_info.h"
+#include "scx_mfq_common.h"
+#include "scx_mfq.skel.h"
+
+#include "mfq_sched.h"
+
+const char help_fmt[] =
+"A minimal userland sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-b BATCH] [-p]\n"
+"\n"
+"  -b BATCH      The number of tasks to batch when dispatching (default: 8)\n"
+"  -p            Don't switch all, switch only tasks on SCHED_EXT policy\n"
+"  -h            Display this help and exit\n";
+
+/* Defined in UAPI */
+#define SCHED_EXT 7
+
+/* Number of tasks to batch when dispatching to user space. */
+static __u32 batch_size = 8;
+
+static volatile int exit_req;
+static int enqueued_fd, dispatched_fd;
+
+static struct scx_mfq *skel;
+static struct bpf_link *ops_link;
+
+/* Stats collected in user space. */
+static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches;
+
+
+
+
+/*
+ * The statically allocated array of tasks. We use a statically allocated list
+ * here to avoid having to allocate on the enqueue path, which could cause a
+ * deadlock. A more substantive user space scheduler could e.g. provide a hook
+ * for newly enabled tasks that are passed to the scheduler from the
+ * .prep_enable() callback to allows the scheduler to allocate on safe paths.
+ */
+struct enqueued_task tasks[USERLAND_MAX_TASKS];
+
+/* queues for mfq. */
+MFQSched mfq(batch_size);
+
+static void sigint_handler(int sig)
+{
+	exit_req = 1;
+}
+
+//**************************************drain_enqueued_map*******************************************
+static struct enqueued_task *get_enqueued_task(__s32 pid)
+{
+	if (pid >= USERLAND_MAX_TASKS) 
+		return NULL;
+
+	return &tasks[pid];
+}
+
+static int mfq_enqueue(const struct scx_userland_enqueued_task *bpf_task)
+{
+	struct enqueued_task *curr = nullptr;
+
+	curr = get_enqueued_task(bpf_task->pid);
+	if (!curr) {
+		return ENOENT;
+	}
+
+	curr->sum_exec_runtime = bpf_task->sum_exec_runtime;
+	nr_vruntime_enqueues++;
+
+	/* Enqueue the task to the mfq queue */
+
+	bool ans = mfq.Enqueue(curr);
+	
+	if (!ans) {
+		fprintf(stderr, "Failed to enqueue task %d: target queue is full\n", bpf_task->pid);
+		exit_req = 1;
+		return ENOSPC;
+	}
+
+	return 0;
+}
+
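+/*
+ * Pop every task from the BPF 'enqueued' queue map and feed it into the
+ * user space MFQ run queues.
+ */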
+static void drain_enqueued_map(void)
+{
+	while (1) {
+		
+		struct scx_userland_enqueued_task task;
+		int err;
+		if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)){
+			return;
+		}	
+
+		err = mfq_enqueue(&task);
+
+		if (err) {
+			fprintf(stderr, "Failed to enqueue task %d: %s\n",
+				task.pid, strerror(err));
+			exit_req = 1;
+			return;
+		}
+
+	}
+}
+//**************************************drain_enqueued_map*******************************************
+
+
+//**************************************dispatch_batch*****************************************
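+/*
+ * tasks[] is indexed directly by pid, so a task's pid can be recovered from
+ * its offset within the array.
+ */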
+static __u32 task_pid(const struct enqueued_task *task)
+{
+	return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task);
+}
+
+static int dispatch_task(s32 pid)
+{
+	int err;
+	
+	err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0);
+	if (err) {
+		fprintf(stderr, "Failed to dispatch task %d\n", pid);
+		exit_req = 1;
+	} else {
+		nr_vruntime_dispatches++;
+	}
+
+	return err;
+}
+
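+/*
+ * Ask the MFQ policy for the next set of runnable tasks and push their pids
+ * into the BPF 'dispatched' map for the kernel to consume.
+ */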
+static void dispatch_batch(void)
+{
+	auto thread_to_dispatch = mfq.Schedule();
+	
+	if (thread_to_dispatch.size() == 0) {
+		return;
+	}
+
+	for (auto task : thread_to_dispatch) {
+		__u32 pid = task_pid(task);
+		printf("%d schedule thread %d\n", getpid(), pid);
+		int err = dispatch_task(pid);
+		if (err) {
+			fprintf(stderr, "Failed to dispatch task %d\n", pid);
+			return;
+		}
+	}
+	return;
+	
+}
+//**************************************dispatch_batch*****************************************
+
+
+static void *run_stats_printer(void *arg)
+{
+	while (!exit_req) {
+		// __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total;
+
+		// nr_failed_enqueues = skel->bss->nr_failed_enqueues;
+		// nr_kernel_enqueues = skel->bss->nr_kernel_enqueues;
+		// nr_user_enqueues = skel->bss->nr_user_enqueues;
+		// total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues;
+
+		// printf("o-----------------------o\n");
+		// printf("| BPF ENQUEUES          |\n");
+		// printf("|-----------------------|\n");
+		// printf("|  kern:     %10llu |\n", nr_kernel_enqueues);
+		// printf("|  user:     %10llu |\n", nr_user_enqueues);
+		// printf("|  failed:   %10llu |\n", nr_failed_enqueues);
+		// printf("|  -------------------- |\n");
+		// printf("|  total:    %10llu |\n", total);
+		// printf("|                       |\n");
+		// printf("|-----------------------|\n");
+		// printf("| VRUNTIME / USER       |\n");
+		// printf("|-----------------------|\n");
+		// printf("|  enq:      %10llu |\n", nr_vruntime_enqueues);
+		// printf("|  disp:     %10llu |\n", nr_vruntime_dispatches);
+		// printf("o-----------------------o\n");
+		// printf("%d\n",getpid());
+		// printf("\n\n");
+		sleep(1);
+	}
+
+	return NULL;
+}
+
+static int spawn_stats_thread(void)
+{
+	pthread_t stats_printer;
+
+	return pthread_create(&stats_printer, NULL, run_stats_printer, NULL);
+}
+
+static int bootstrap(int argc, char **argv)
+{
+	int err;
+	//__u32 opt;
+	int opt;
+	struct sched_param sched_param = {
+		.sched_priority = sched_get_priority_max(SCHED_EXT),
+	};
+	bool switch_partial = false;
+
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+	/*
+	 * Enforce that the user scheduler task is managed by sched_ext. The
+	 * task eagerly drains the list of enqueued tasks in its main work
+	 * loop, and then yields the CPU. The BPF scheduler only schedules the
+	 * user space scheduler task when at least one other task in the system
+	 * needs to be scheduled.
+	 */
+	err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param);
+	if (err) {
+		fprintf(stderr, "Failed to set scheduler to SCHED_EXT: %s\n", strerror(err));
+		return err;
+	}
+
+	while ((opt = getopt(argc, argv, "b:ph")) != -1) {
+		switch (opt) {
+		case 'b':
+			batch_size = strtoul(optarg, NULL, 0);
+			break;
+		case 'p':
+			switch_partial = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			exit(opt != 'h');
+		}
+	}
+
+
+	/*
+	 * It's not always safe to allocate in a user space scheduler, as an
+	 * enqueued task could hold a lock that we require in order to be able
+	 * to allocate.
+	 */
+	err = mlockall(MCL_CURRENT | MCL_FUTURE);
+	if (err) {
+		fprintf(stderr, "Failed to prefault and lock address space: %s\n",
+			strerror(err));
+		return err;
+	}
+
+	skel = scx_mfq__open();
+	if (!skel) {
+		fprintf(stderr, "Failed to open scheduler: %s\n", strerror(errno));
+		return errno;
+	}
+	skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
+	assert(skel->rodata->num_possible_cpus > 0);
+	skel->rodata->usersched_pid = getpid();
+	assert(skel->rodata->usersched_pid > 0);
+	skel->rodata->switch_partial = switch_partial;
+
+	err = scx_mfq__load(skel);
+	if (err) {
+		fprintf(stderr, "Failed to load scheduler: %s\n", strerror(err));
+		goto destroy_skel;
+	}
+
+	enqueued_fd = bpf_map__fd(skel->maps.enqueued);
+	dispatched_fd = bpf_map__fd(skel->maps.dispatched);
+	assert(enqueued_fd > 0);
+	assert(dispatched_fd > 0);
+
+	err = spawn_stats_thread(); /* spawn the stats-printing thread */
+	if (err) {
+		fprintf(stderr, "Failed to spawn stats thread: %s\n", strerror(err));
+		goto destroy_skel;
+	}
+
+	ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops);
+	if (!ops_link) {
+		fprintf(stderr, "Failed to attach struct ops: %s\n", strerror(errno));
+		err = errno;
+		goto destroy_skel;
+	}
+
+	return 0;
+
+destroy_skel:
+	scx_mfq__destroy(skel);
+	exit_req = 1;
+	return err;
+}
+
+static void sched_main_loop(void)
+{
+	while (!exit_req) {
+		drain_enqueued_map();
+		dispatch_batch();
+		sched_yield();
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int err;
+
+	// init mfq
+	// mfq = MFQSched((size_t)batch_size);
+	err = bootstrap(argc, argv);
+	if (err) {
+		fprintf(stderr, "Failed to bootstrap scheduler: %s\n", strerror(err));
+		return err;
+	}
+
+	sched_main_loop();
+
+	exit_req = 1;
+	bpf_link__destroy(ops_link);
+	uei_print(&skel->bss->uei);
+	scx_mfq__destroy(skel);
+	return 0;
+}
diff --git a/sched/mfq/scx_mfq_common.h b/sched/mfq/scx_mfq_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..8382bbc0b983e0d576c8dc8e86c4ae497684f014
--- /dev/null
+++ b/sched/mfq/scx_mfq_common.h
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta, Inc */
+
+#ifndef __SCX_USERLAND_COMMON_H
+#define __SCX_USERLAND_COMMON_H
+
+#define USERLAND_MAX_TASKS 60000
+
+#define MFQ_QUEUE_NO1_SIZE 1000
+#define MFQ_QUEUE_NO2_SIZE 2000
+#define MFQ_QUEUE_NO3_SIZE 3000
+#define MFQ_QUEUE_NO4_SIZE 4000
+#define MFQ_QUEUE_NO5_SIZE 5000
+
+
+#include "../../lib/cos_client.h"
+#include "../../lib/cos.h"
+
+/*
+ * An instance of a task that has been enqueued by the kernel for consumption
+ * by a user space global scheduler thread.
+ */
+struct scx_userland_enqueued_task {
+	__s32 pid;
+	u64 sum_exec_runtime;
+	u64 weight;
+	__s32 queue_id; 
+};
+
+#endif  // __SCX_USERLAND_COMMON_H
diff --git a/sched/scx_common.bpf.h b/sched/scx_common.bpf.h
new file mode 100644
index 0000000000000000000000000000000000000000..e56de9dc86f288caf1e9fbdd109cb0a09991fbc1
--- /dev/null
+++ b/sched/scx_common.bpf.h
@@ -0,0 +1,288 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifndef __SCHED_EXT_COMMON_BPF_H
+#define __SCHED_EXT_COMMON_BPF_H
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <linux/errno.h>
+#include "user_exit_info.h"
+
+#define PF_KTHREAD			0x00200000	/* I am a kernel thread */
+#define PF_EXITING			0x00000004
+#define CLOCK_MONOTONIC			1
+
+/*
+ * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can
+ * lead to really confusing misbehaviors. Let's trigger a build failure.
+ */
+static inline void ___vmlinux_h_sanity_check___(void)
+{
+	_Static_assert(SCX_DSQ_FLAG_BUILTIN,
+		       "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole");
+}
+
+void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
+
+static inline __attribute__((format(printf, 1, 2)))
+void ___scx_bpf_error_format_checker(const char *fmt, ...) {}
+
+/*
+ * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
+ * instead of an array of u64. Note that __param[] must have at least one
+ * element to keep the verifier happy.
+ */
+#define scx_bpf_error(fmt, args...)						\
+({										\
+	static char ___fmt[] = fmt;						\
+	unsigned long long ___param[___bpf_narg(args) ?: 1] = {};		\
+										\
+	_Pragma("GCC diagnostic push")						\
+	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"")			\
+	___bpf_fill(___param, args);						\
+	_Pragma("GCC diagnostic pop")						\
+										\
+	scx_bpf_error_bstr(___fmt, ___param, sizeof(___param));			\
+										\
+	___scx_bpf_error_format_checker(fmt, ##args);				\
+})
+
+void scx_bpf_switch_all(void) __ksym;
+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
+bool scx_bpf_consume(u64 dsq_id) __ksym;
+u32 scx_bpf_dispatch_nr_slots(void) __ksym;
+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
+s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym;
+const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
+const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
+void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
+void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+bool scx_bpf_task_running(const struct task_struct *p) __ksym;
+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
+u32 scx_bpf_reenqueue_local(void) __ksym;
+
+#define BPF_STRUCT_OPS(name, args...)						\
+SEC("struct_ops/"#name)								\
+BPF_PROG(name, ##args)
+
+#define BPF_STRUCT_OPS_SLEEPABLE(name, args...)					\
+SEC("struct_ops.s/"#name)							\
+BPF_PROG(name, ##args)
+
+/**
+ * MEMBER_VPTR - Obtain the verified pointer to a struct or array member
+ * @base: struct or array to index
+ * @member: dereferenced member (e.g. ->field, [idx0][idx1], ...)
+ *
+ * The verifier often gets confused by the instruction sequence the compiler
+ * generates for indexing struct fields or arrays. This macro forces the
+ * compiler to generate a code sequence which first calculates the byte offset,
+ * checks it against the struct or array size and adds that byte offset to
+ * generate the pointer to the member to help the verifier.
+ *
+ * Ideally, we want to abort if the calculated offset is out-of-bounds. However,
+ * BPF currently doesn't support abort, so evaluate to NULL instead. The caller
+ * must check for NULL and take appropriate action to appease the verifier. To
+ * avoid confusing the verifier, it's best to check for NULL and dereference
+ * immediately.
+ *
+ *	vptr = MEMBER_VPTR(my_array, [i][j]);
+ *	if (!vptr)
+ *		return error;
+ *	*vptr = new_value;
+ */
+#define MEMBER_VPTR(base, member) (typeof(base member) *)({			\
+	u64 __base = (u64)base;							\
+	u64 __addr = (u64)&(base member) - __base;				\
+	asm volatile (								\
+		"if %0 <= %[max] goto +2\n"					\
+		"%0 = 0\n"							\
+		"goto +1\n"							\
+		"%0 += %1\n"							\
+		: "+r"(__addr)							\
+		: "r"(__base),							\
+		  [max]"i"(sizeof(base) - sizeof(base member)));		\
+	__addr;									\
+})
+
+/*
+ * BPF core and other generic helpers
+ */
+
+/* list and rbtree */
+#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+
+void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
+void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
+
+#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
+#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL)
+
+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym;
+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
+struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+				      struct bpf_rb_node *node) __ksym;
+void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+		    bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) __ksym;
+struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
+
+/* task */
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
+/* cgroup */
+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym;
+void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
+
+/* cpumask */
+struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
+struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
+u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
+u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym;
+void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;
+bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym;
+bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1,
+		     const struct cpumask *src2) __ksym;
+void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1,
+		    const struct cpumask *src2) __ksym;
+void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1,
+		     const struct cpumask *src2) __ksym;
+bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym;
+bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym;
+void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym;
+u32 bpf_cpumask_any(const struct cpumask *cpumask) __ksym;
+u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+
+/* rcu */
+void bpf_rcu_read_lock(void) __ksym;
+void bpf_rcu_read_unlock(void) __ksym;
+
+/* BPF core iterators from tools/testing/selftests/bpf/progs/bpf_misc.h */
+struct bpf_iter_num;
+
+extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym;
+extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym;
+extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym;
+
+#ifndef bpf_for_each
+/* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for
+ * using BPF open-coded iterators without having to write mundane explicit
+ * low-level loop logic. Instead, it provides for()-like generic construct
+ * that can be used pretty naturally. E.g., for some hypothetical cgroup
+ * iterator, you'd write:
+ *
+ * struct cgroup *cg, *parent_cg = <...>;
+ *
+ * bpf_for_each(cgroup, cg, parent_cg, CG_ITER_CHILDREN) {
+ *     bpf_printk("Child cgroup id = %d", cg->cgroup_id);
+ *     if (cg->cgroup_id == 123)
+ *         break;
+ * }
+ *
+ * I.e., it looks almost like high-level for each loop in other languages,
+ * supports continue/break, and is verifiable by BPF verifier.
+ *
+ * For iterating integers, the difference between bpf_for_each(num, i, N, M)
+ * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to
+ * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int
+ * *`, not just `int`. So for integers bpf_for() is more convenient.
+ *
+ * Note: this macro relies on C99 feature of allowing to declare variables
+ * inside for() loop, bound to for() loop lifetime. It also utilizes GCC
+ * extension: __attribute__((cleanup(<func>))), supported by both GCC and
+ * Clang.
+ */
+#define bpf_for_each(type, cur, args...) for (							\
+	/* initialize and define destructor */							\
+	struct bpf_iter_##type ___it __attribute__((aligned(8), /* enforce, just in case */	\
+						    cleanup(bpf_iter_##type##_destroy))),	\
+	/* ___p pointer is just to call bpf_iter_##type##_new() *once* to init ___it */		\
+			       *___p __attribute__((unused)) = (				\
+					bpf_iter_##type##_new(&___it, ##args),			\
+	/* this is a workaround for Clang bug: it currently doesn't emit BTF */			\
+	/* for bpf_iter_##type##_destroy() when used from cleanup() attribute */		\
+					(void)bpf_iter_##type##_destroy, (void *)0);		\
+	/* iteration and termination check */							\
+	(((cur) = bpf_iter_##type##_next(&___it)));						\
+)
+#endif /* bpf_for_each */
+
+#ifndef bpf_for
+/* bpf_for(i, start, end) implements a for()-like looping construct that sets
+ * provided integer variable *i* to values starting from *start* through,
+ * but not including, *end*. It also proves to BPF verifier that *i* belongs
+ * to range [start, end), so this can be used for accessing arrays without
+ * extra checks.
+ *
+ * Note: *start* and *end* are assumed to be expressions with no side effects
+ * and whose values do not change throughout bpf_for() loop execution. They do
+ * not have to be statically known or constant, though.
+ *
+ * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for()
+ * loop bound variables and cleanup attribute, supported by GCC and Clang.
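+ *
+ * For example (illustrative):
+ *
+ *	int i;
+ *
+ *	bpf_for(i, 0, 8)
+ *		total += cnts[i];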
+ */
+#define bpf_for(i, start, end) for (								\
+	/* initialize and define destructor */							\
+	struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */	\
+						 cleanup(bpf_iter_num_destroy))),		\
+	/* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */		\
+			    *___p __attribute__((unused)) = (					\
+				bpf_iter_num_new(&___it, (start), (end)),			\
+	/* this is a workaround for Clang bug: it currently doesn't emit BTF */			\
+	/* for bpf_iter_num_destroy() when used from cleanup() attribute */			\
+				(void)bpf_iter_num_destroy, (void *)0);				\
+	({											\
+		/* iteration step */								\
+		int *___t = bpf_iter_num_next(&___it);						\
+		/* termination and bounds check */						\
+		(___t && ((i) = *___t, (i) >= (start) && (i) < (end)));				\
+	});											\
+)
+#endif /* bpf_for */
+
+#ifndef bpf_repeat
+/* bpf_repeat(N) performs N iterations without exposing iteration number
+ *
+ * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for()
+ * loop bound variables and cleanup attribute, supported by GCC and Clang.
+ */
+#define bpf_repeat(N) for (									\
+	/* initialize and define destructor */							\
+	struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */	\
+						 cleanup(bpf_iter_num_destroy))),		\
+	/* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */		\
+			    *___p __attribute__((unused)) = (					\
+				bpf_iter_num_new(&___it, 0, (N)),				\
+	/* this is a workaround for Clang bug: it currently doesn't emit BTF */			\
+	/* for bpf_iter_num_destroy() when used from cleanup() attribute */			\
+				(void)bpf_iter_num_destroy, (void *)0);				\
+	bpf_iter_num_next(&___it);								\
+	/* nothing here  */									\
+)
+#endif /* bpf_repeat */
+
+#endif	/* __SCHED_EXT_COMMON_BPF_H */
diff --git a/sched/sjf/scx_sjf.bpf.c b/sched/sjf/scx_sjf.bpf.c
new file mode 100644
index 0000000000000000000000000000000000000000..7f5b6afee49d08f65f038bca8639ba94dbc33013
--- /dev/null
+++ b/sched/sjf/scx_sjf.bpf.c
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A minimal userland scheduler.
+ *
+ * In terms of scheduling, this provides two different types of behaviors:
+ * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity.
+ *    All such tasks are direct-dispatched from the kernel, and are never
+ *    enqueued in user space.
+ * 2. A primitive vruntime scheduler that is implemented in user space, for all
+ *    other tasks.
+ *
+ * Some parts of this example user space scheduler could be implemented more
+ * efficiently using more complex and sophisticated data structures. For
+ * example, rather than using BPF_MAP_TYPE_QUEUE's,
+ * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between
+ * user space and kernel space. Similarly, we use a simple vruntime-sorted list
+ * in user space, but an rbtree could be used instead.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <string.h>
+#include "../scx_common.bpf.h"
+#include "scx_sjf_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+const volatile bool switch_partial;
+const volatile s32 usersched_pid;
+
+/* !0 for veristat, set during init */
+const volatile u32 num_possible_cpus = 64;
+
+/* Stats that are printed by user space. */
+u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues;
+
+struct user_exit_info uei;
+
+/*
+ * Whether the user space scheduler needs to be scheduled due to a task being
+ * enqueued in user space.
+ */
+static bool usersched_needed;
+
+/*
+ * The map containing tasks that are enqueued in user space from the kernel.
+ *
+ * This map is drained by the user space scheduler.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, USERLAND_MAX_TASKS);
+	__type(value, struct scx_userland_enqueued_task);
+} enqueued SEC(".maps");
+
+/*
+ * The map containing tasks that are dispatched to the kernel from user space.
+ *
+ * Drained by the kernel in userland_dispatch().
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, USERLAND_MAX_TASKS);
+	__type(value, s32);
+} dispatched SEC(".maps");
+
+/* Per-task scheduling context */
+struct task_ctx {
+	bool force_local; /* Dispatch directly to local DSQ */
+};
+
+/* Map that contains task-local storage. */
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static bool is_usersched_task(const struct task_struct *p)
+{
+	return p->pid == usersched_pid;
+}
+
+static bool keep_in_kernel(const struct task_struct *p)
+{
+	return p->nr_cpus_allowed < num_possible_cpus;
+}
+
+static struct task_struct *usersched_task(void)
+{
+	struct task_struct *p;
+
+	p = bpf_task_from_pid(usersched_pid);
+	/*
+	 * Should never happen -- the usersched task should always be managed
+	 * by sched_ext.
+	 */
+	if (!p)
+		scx_bpf_error("Failed to find usersched task %d", usersched_pid);
+
+	return p;
+}
+
+s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	if (keep_in_kernel(p)) {
+		s32 cpu;
+		struct task_ctx *tctx;
+
+		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+		if (!tctx) {
+			scx_bpf_error("Failed to look up task-local storage for %s", p->comm);
+			return -ESRCH;
+		}
+
+		if (p->nr_cpus_allowed == 1 ||
+		    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+			tctx->force_local = true;
+			return prev_cpu;
+		}
+
+		cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr);
+		if (cpu >= 0) {
+			tctx->force_local = true;
+			return cpu;
+		}
+	}
+
+	return prev_cpu;
+}
+
+static void dispatch_user_scheduler(void)
+{
+	struct task_struct *p;
+
+	usersched_needed = false;
+	p = usersched_task();
+	if (p) {
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	}
+}
+
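+/*
+ * Copy the fields the user space scheduler needs (pid, runtime, weight and
+ * tgid) into a scx_userland_enqueued_task and push it onto the 'enqueued'
+ * queue map. If the map is full, fall back to the global DSQ.
+ */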
+static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags)
+{
+	
+	struct scx_userland_enqueued_task task;
+
+	memset(&task, 0, sizeof(task));
+	task.pid = p->pid;
+	task.sum_exec_runtime = p->se.sum_exec_runtime;
+	task.weight = p->scx.weight;
+	task.tgid = p->tgid;
+	// bpf_trace_printk("enqueue taggered! task->pid = %d, tgid = %d\n",p->pid,p->tgid);
+	if (bpf_map_push_elem(&enqueued, &task, 0)) {
+		/*
+		 * If we fail to enqueue the task in user space, put it
+		 * directly on the global DSQ.
+		 */
+		__sync_fetch_and_add(&nr_failed_enqueues, 1);
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+	} else {
+		__sync_fetch_and_add(&nr_user_enqueues, 1);
+		usersched_needed = true;
+	}
+}
+
+void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	if (keep_in_kernel(p)) {
+		u64 dsq_id = SCX_DSQ_GLOBAL;
+		struct task_ctx *tctx;
+
+		/* task_ctx: the per-task scheduling context defined above. */
+		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+		if (!tctx) {
+			scx_bpf_error("Failed to lookup task ctx for %s", p->comm);
+			return;
+		}
+
+		if (tctx->force_local)
+			dsq_id = SCX_DSQ_LOCAL;
+		tctx->force_local = false;
+		scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags);
+		__sync_fetch_and_add(&nr_kernel_enqueues, 1);
+		return;
+	} else if (!is_usersched_task(p)) {
+		enqueue_task_in_user_space(p, enq_flags);
+	}
+}
+/*
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
+ * scx_bpf_consume().
+*/
+void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev)
+{
+	/*
+	* Whether the user space scheduler needs to be scheduled due to a task being
+	* enqueued in user space.
+	*/
+	if (usersched_needed)
+		dispatch_user_scheduler();
+
+	bpf_repeat(4096) {
+		s32 pid;
+		struct task_struct *p;
+
+		if (bpf_map_pop_elem(&dispatched, &pid))
+			break;
+
+		/*
+		 * The task could have exited by the time we get around to
+		 * dispatching it. Treat this as a normal occurrence, and simply
+		 * move onto the next iteration.
+		 */
+		p = bpf_task_from_pid(pid);
+		if (!p)
+			continue;
+
+		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		bpf_task_release(p);
+	}
+}
+
+s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p,
+		   struct scx_enable_args *args)
+{
+	if (bpf_task_storage_get(&task_ctx_stor, p, 0,
+				 BPF_LOCAL_STORAGE_GET_F_CREATE))
+		return 0;
+	else
+		return -ENOMEM;
+}
+
+s32 BPF_STRUCT_OPS(userland_init)
+{
+	if (num_possible_cpus == 0) {
+		scx_bpf_error("User scheduler # CPUs uninitialized (%d)",
+			      num_possible_cpus);
+		return -EINVAL;
+	}
+
+	if (usersched_pid <= 0) {
+		scx_bpf_error("User scheduler pid uninitialized (%d)",
+			      usersched_pid);
+		return -EINVAL;
+	}
+
+	if (!switch_partial)
+		scx_bpf_switch_all();
+	return 0;
+}
+
+void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei)
+{
+	uei_record(&uei, ei);
+}
+
+SEC(".struct_ops")
+struct sched_ext_ops userland_ops = {
+	.select_cpu		= (void *)userland_select_cpu,
+	.enqueue		= (void *)userland_enqueue,
+	.dispatch		= (void *)userland_dispatch,
+	.prep_enable		= (void *)userland_prep_enable,
+	.init			= (void *)userland_init,
+	.exit			= (void *)userland_exit,
+	.timeout_ms		= 3000,
+	.name			= "userland",
+};
diff --git a/sched/sjf/scx_sjf.c b/sched/sjf/scx_sjf.c
new file mode 100644
index 0000000000000000000000000000000000000000..60dce55d2c0d82a3ec6a6ebd7e1368cba7dcaf10
--- /dev/null
+++ b/sched/sjf/scx_sjf.c
@@ -0,0 +1,451 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A demo sched_ext user space scheduler which provides vruntime semantics
+ * using a simple ordered-list implementation.
+ *
+ * Each CPU in the system resides in a single, global domain. This precludes
+ * the need to do any load balancing between domains. The scheduler could
+ * easily be extended to support multiple domains, with load balancing
+ * happening in user space.
+ *
+ * Any task which has any CPU affinity is scheduled entirely in BPF. This
+ * program only schedules tasks which may run on any CPU.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+//#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <sched.h>
+#include <signal.h>
+#include <assert.h>
+#include <libgen.h>
+#include <pthread.h>
+#include <bpf/bpf.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <map>
+
+#include "../user_exit_info.h"
+#include "scx_sjf_common.h"
+#include "scx_sjf.skel.h"
+
+#include "hash.h"
+
+const char help_fmt[] =
+"A minimal userland sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-b BATCH] [-p]\n"
+"\n"
+"  -b BATCH      The number of tasks to batch when dispatching (default: 8)\n"
+"  -p            Don't switch all, switch only tasks on SCHED_EXT policy\n"
+"  -h            Display this help and exit\n";
+
+/* Defined in UAPI */
+#define SCHED_EXT 7
+
+/* Number of tasks to batch when dispatching to user space. */
+static __u32 batch_size = 8;
+
+static volatile int exit_req;
+static int enqueued_fd, dispatched_fd;
+
+static struct scx_sjf *skel;
+static struct bpf_link *ops_link;
+
+/* Stats collected in user space. */
+static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches;
+
+/* The data structure containing tasks that are enqueued in user space. */
+struct enqueued_task {
+	LIST_ENTRY(enqueued_task) entries;
+	__u64 sum_exec_runtime;
+	double vruntime;
+};
+
+/*
+ * Use a vruntime-sorted list to store tasks. This could easily be extended to
+ * a more optimal data structure, such as an rbtree as is done in CFS. We
+ * currently elect to use a sorted list to simplify the example for
+ * illustrative purposes.
+ */
+LIST_HEAD(listhead, enqueued_task);
+
+/*
+ * A vruntime-sorted list of tasks. The head of the list contains the task with
+ * the lowest vruntime. That is, the task that has the "highest" claim to be
+ * scheduled.
+ */
+static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head);
+
+/*
+ * The statically allocated array of tasks. We use a statically allocated list
+ * here to avoid having to allocate on the enqueue path, which could cause a
+ * deadlock. A more substantive user space scheduler could e.g. provide a hook
+ * for newly enabled tasks that are passed to the scheduler from the
+ * .prep_enable() callback to allow the scheduler to allocate on safe paths.
+ */
+struct enqueued_task tasks[USERLAND_MAX_TASKS];
+
+static double min_vruntime;
+
+static void sigint_handler(int userland)
+{
+	exit_req = 1;
+}
+
+static __u32 task_pid(const struct enqueued_task *task)
+{
+	return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task);
+}
+
+static int dispatch_task(s32 pid)
+{
+	int err;
+	
+	err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0);
+	if (err) {
+		fprintf(stderr, "Failed to dispatch task %d\n", pid);
+		exit_req = 1;
+	} else {
+		nr_vruntime_dispatches++;
+	}
+
+	return err;
+}
+
+static struct enqueued_task *get_enqueued_task(__s32 pid)
+{
+	if (pid >= USERLAND_MAX_TASKS) 
+		return NULL;
+
+	return &tasks[pid];
+}
+
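+/*
+ * Scale the runtime delta by the inverse of the task's weight (normalized to
+ * a weight of 100) to obtain its vruntime increment, mirroring what CFS does.
+ */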
+static double calc_vruntime_delta(__u64 weight, __u64 delta)
+{
+	double weight_f = (double)weight / 100.0;
+	double delta_f = (double)delta;
+
+	return delta_f / weight_f;
+}
+
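+/*
+ * Advance the task's vruntime by the weighted runtime it consumed since it
+ * was last enqueued, clamping it up to min_vruntime so a long sleep does not
+ * translate into an unbounded scheduling advantage.
+ */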
+static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task)
+{
+	__u64 delta;
+
+	delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime;
+
+	enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta);
+	if (min_vruntime > enqueued->vruntime)
+		enqueued->vruntime = min_vruntime;
+	enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime;
+}
+
+static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task)
+{
+	struct enqueued_task *curr, *enqueued, *prev;
+
+	curr = get_enqueued_task(bpf_task->pid);
+	if (!curr)
+		return ENOENT;
+
+	update_enqueued(curr, bpf_task);
+	nr_vruntime_enqueues++;
+
+	/*
+	 * Enqueue the task in a vruntime-sorted list. A more optimal data
+	 * structure such as an rbtree could easily be used as well. We elect
+	 * to use a list here simply because it's less code, and thus the
+	 * example is less convoluted and better serves to illustrate what a
+	 * user space scheduler could look like.
+	 */
+
+	if (LIST_EMPTY(&vruntime_head)) {
+		LIST_INSERT_HEAD(&vruntime_head, curr, entries);
+		return 0;
+	}
+
+	LIST_FOREACH(enqueued, &vruntime_head, entries) {
+		if (curr->vruntime <= enqueued->vruntime) {
+			LIST_INSERT_BEFORE(enqueued, curr, entries);
+			return 0;
+		}
+		prev = enqueued;
+	}
+
+	LIST_INSERT_AFTER(prev, curr, entries);
+
+	return 0;
+}
+
+
+#include <fcntl.h> // for O_RDWR and open
+
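+// Lazily mapped per-client shared memory hash tables, keyed by tgid. Each
+// client exposes per-thread scheduling hints (struct entry, e.g. a deadline)
+// through /etc/cos/shm/shm_<tgid>, which drain_enqueued_map() consults below.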
+// std::map<__s32, LFHashTable<int32_t>> tgid2hashtable; 
+std::map<__s32, LFHashTable<struct entry>> tgid2hashtable; 
+
+static void drain_enqueued_map(void)
+{
+	while (1) {
+		struct scx_userland_enqueued_task task;
+		int err;
+		if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)){
+			return;
+		}	
+
+
+		/* do schedule */
+		__s32 tgid = task.tgid; 
+
+		if (tgid != getpid()) {
+			printf("agent %d scheduling thread %d of process %d\n",
+			       getpid(), task.pid, tgid);
+
+			if (tgid2hashtable.count(tgid) == 0) {
+				char buf[128];
+				snprintf(buf, sizeof(buf), "/etc/cos/shm/shm_%d", tgid);
+				int shm_fd = open(buf, O_RDWR);
+				if (shm_fd < 0) { // the client doesn't use shared memory
+					goto enqueue;
+				}
+				void* shm = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
+				if (shm == MAP_FAILED) {
+					close(shm_fd);
+					goto enqueue;
+				}
+				tgid2hashtable[tgid] = LFHashTable<struct entry>(shm, SHM_SIZE, 0);
+			}
+
+			struct entry tmp = tgid2hashtable[tgid].Get(task.pid);
+			memcpy(&(task.data), &tmp, sizeof(struct entry));
+			// task.ddl = tgid2hashtable[tgid].Get(task.pid);
+			printf("thread %d ddl=%d\n", task.pid, task.data.ddl);
+		}
+
+		enqueue:
+		err = vruntime_enqueue(&task);
+		if (err) {
+			fprintf(stderr, "Failed to enqueue task %d: %s\n",
+				task.pid, strerror(err));
+			exit_req = 1;
+
+			return;
+		}
+
+
+	}
+}
+
+/* Dispatch up to batch_size tasks from the vruntime-sorted list to the kernel. */
+static void dispatch_batch(void)
+{
+	__u32 i;
+
+	for (i = 0; i < batch_size; i++) {
+		/* struct enqueued_task: a task enqueued in user space (defined above). */
+		struct enqueued_task *task;
+		int err;
+		__s32 pid;
+
+		task = LIST_FIRST(&vruntime_head);
+		if (!task)
+			return;
+
+		min_vruntime = task->vruntime;
+		pid = task_pid(task);
+		LIST_REMOVE(task, entries);
+		err = dispatch_task(pid);
+		if (err) {
+			fprintf(stderr, "Failed to dispatch task %d in %u\n",
+				pid, i);
+			return;
+		}
+	}
+}
+
+static void *run_stats_printer(void *arg)
+{
+	while (!exit_req) {
+		// __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total;
+
+		// nr_failed_enqueues = skel->bss->nr_failed_enqueues;
+		// nr_kernel_enqueues = skel->bss->nr_kernel_enqueues;
+		// nr_user_enqueues = skel->bss->nr_user_enqueues;
+		// total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues;
+
+		// printf("o-----------------------o\n");
+		// printf("| BPF ENQUEUES          |\n");
+		// printf("|-----------------------|\n");
+		// printf("|  kern:     %10llu |\n", nr_kernel_enqueues);
+		// printf("|  user:     %10llu |\n", nr_user_enqueues);
+		// printf("|  failed:   %10llu |\n", nr_failed_enqueues);
+		// printf("|  -------------------- |\n");
+		// printf("|  total:    %10llu |\n", total);
+		// printf("|                       |\n");
+		// printf("|-----------------------|\n");
+		// printf("| VRUNTIME / USER       |\n");
+		// printf("|-----------------------|\n");
+		// printf("|  enq:      %10llu |\n", nr_vruntime_enqueues);
+		// printf("|  disp:     %10llu |\n", nr_vruntime_dispatches);
+		// printf("o-----------------------o\n");
+		// printf("%d\n",getpid());
+		// printf("\n\n");
+		sleep(1);
+	}
+
+	return NULL;
+}
+
+static int spawn_stats_thread(void)
+{
+	pthread_t stats_printer;
+
+	return pthread_create(&stats_printer, NULL, run_stats_printer, NULL);
+}
+
+static int bootstrap(int argc, char **argv)
+{
+	int err;
+	//__u32 opt;
+	int opt;
+	struct sched_param sched_param = {
+		.sched_priority = sched_get_priority_max(SCHED_EXT),
+	};
+	bool switch_partial = false;
+
+	signal(SIGINT, sigint_handler);
+	signal(SIGTERM, sigint_handler);
+	libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+	/*
+	 * Enforce that the user scheduler task is managed by sched_ext. The
+	 * task eagerly drains the list of enqueued tasks in its main work
+	 * loop, and then yields the CPU. The BPF scheduler only schedules the
+	 * user space scheduler task when at least one other task in the system
+	 * needs to be scheduled.
+	 */
+	err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param);
+	if (err) {
+		fprintf(stderr, "Failed to set scheduler to SCHED_EXT: %s\n", strerror(err));
+		return err;
+	}
+
+	while ((opt = getopt(argc, argv, "b:ph")) != -1) {
+		switch (opt) {
+		case 'b':
+			batch_size = strtoul(optarg, NULL, 0);
+			break;
+		case 'p':
+			switch_partial = true;
+			break;
+		default:
+			fprintf(stderr, help_fmt, basename(argv[0]));
+			exit(opt != 'h');
+		}
+	}
+
+
+	/*
+	 * It's not always safe to allocate in a user space scheduler, as an
+	 * enqueued task could hold a lock that we require in order to be able
+	 * to allocate.
+	 */
+	err = mlockall(MCL_CURRENT | MCL_FUTURE);
+	if (err) {
+		fprintf(stderr, "Failed to prefault and lock address space: %s\n",
+			strerror(err));
+		return err;
+	}
+
+	skel = scx_sjf__open();
+	if (!skel) {
+		fprintf(stderr, "Failed to open scheduler: %s\n", strerror(errno));
+		return errno;
+	}
+	skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
+	assert(skel->rodata->num_possible_cpus > 0);
+	skel->rodata->usersched_pid = getpid();
+	assert(skel->rodata->usersched_pid > 0);
+	skel->rodata->switch_partial = switch_partial;
+
+	err = scx_sjf__load(skel);
+	if (err) {
+		fprintf(stderr, "Failed to load scheduler: %s\n", strerror(err));
+		goto destroy_skel;
+	}
+
+	enqueued_fd = bpf_map__fd(skel->maps.enqueued);
+	dispatched_fd = bpf_map__fd(skel->maps.dispatched);
+	assert(enqueued_fd > 0);
+	assert(dispatched_fd > 0);
+
+	err = spawn_stats_thread(); /* spawn the stats-printing thread */
+	if (err) {
+		fprintf(stderr, "Failed to spawn stats thread: %s\n", strerror(err));
+		goto destroy_skel;
+	}
+
+	ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops);
+	if (!ops_link) {
+		fprintf(stderr, "Failed to attach struct ops: %s\n", strerror(errno));
+		err = errno;
+		goto destroy_skel;
+	}
+
+	return 0;
+
+destroy_skel:
+	scx_sjf__destroy(skel);
+	exit_req = 1;
+	return err;
+}
+
+static void sched_main_loop(void)
+{
+	while (!exit_req) {
+		drain_enqueued_map();
+		dispatch_batch();
+		sched_yield();
+	}
+}
+
+int main(int argc, char **argv)
+{
+	int err;
+
+	err = bootstrap(argc, argv);
+	if (err) {
+		fprintf(stderr, "Failed to bootstrap scheduler: %s\n", strerror(err));
+		return err;
+	}
+
+	sched_main_loop();
+
+	exit_req = 1;
+	bpf_link__destroy(ops_link);
+	uei_print(&skel->bss->uei);
+	scx_sjf__destroy(skel);
+	return 0;
+}
diff --git a/sched/sjf/scx_sjf_common.h b/sched/sjf/scx_sjf_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..b614f94554955846cebee4fe8bca47f2bf1e5e7f
--- /dev/null
+++ b/sched/sjf/scx_sjf_common.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta, Inc */
+
+#ifndef __SCX_USERLAND_COMMON_H
+#define __SCX_USERLAND_COMMON_H
+
+#define USERLAND_MAX_TASKS 60000
+
+#include "cos_client.h"
+#include "cos.h"
+
+/*
+ * An instance of a task that has been enqueued by the kernel for consumption
+ * by a user space global scheduler thread.
+ */
+struct scx_userland_enqueued_task {
+	__s32 pid;
+	u64 sum_exec_runtime;
+	u64 weight;
+
+	/* Additional fields: */
+	__s32 tgid; /* pid of the process this thread belongs to; filled in by enqueue_task_in_user_space() in the BPF program */
+	struct entry data;
+};
+
+#endif  // __SCX_USERLAND_COMMON_H
diff --git a/sched/user_exit_info.h b/sched/user_exit_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..e701ef0e0b86c333207d8e315f6656cd3095195c
--- /dev/null
+++ b/sched/user_exit_info.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Define struct user_exit_info which is shared between BPF and userspace parts
+ * to communicate exit status and other information.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifndef __USER_EXIT_INFO_H
+#define __USER_EXIT_INFO_H
+
+struct user_exit_info {
+	int		type;
+	char		reason[128];
+	char		msg[1024];
+};
+
+#ifdef __bpf__
+
+#include "vmlinux.h"
+#include <bpf/bpf_core_read.h>
+
+static inline void uei_record(struct user_exit_info *uei,
+			      const struct scx_exit_info *ei)
+{
+	bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason);
+	bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg);
+	/* use __sync to force memory barrier */
+	__sync_val_compare_and_swap(&uei->type, uei->type, ei->type);
+}
+
+#else	/* !__bpf__ */
+
+static inline bool uei_exited(struct user_exit_info *uei)
+{
+	/* use __sync to force memory barrier */
+	return __sync_val_compare_and_swap(&uei->type, -1, -1);
+}
+
+static inline void uei_print(const struct user_exit_info *uei)
+{
+	fprintf(stderr, "EXIT: %s", uei->reason);
+	if (uei->msg[0] != '\0')
+		fprintf(stderr, " (%s)", uei->msg);
+	fputs("\n", stderr);
+}
+
+#endif	/* __bpf__ */
+#endif	/* __USER_EXIT_INFO_H */
diff --git a/tests/simple_test.cpp b/tests/simple_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9058878eb03fb47e2ec9a3fe899e4a9d298a70fd
--- /dev/null
+++ b/tests/simple_test.cpp
@@ -0,0 +1,106 @@
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <chrono> // for timing
+#include <thread> // for std::thread
+#include <memory> // for std::unique_ptr
+#include <vector>
+#include <assert.h>
+
+#include "cos_thread.h"
+
+#define SCHED_EXT 7
+#define SCHED_NORMAL 0
+
+const sched_param param{};
+
+class Timer {
+public:
+  Timer() { startTime_ = std::chrono::high_resolution_clock::now(); }
+  ~Timer() {
+    std::chrono::duration<double, std::milli> elapsed =
+        std::chrono::high_resolution_clock::now() - startTime_;
+    printf("The test takes %0.1f ms\n", elapsed.count());
+  }
+private:
+  std::chrono::high_resolution_clock::time_point startTime_;
+};
+
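+// Each test follows the same pattern: create a CosThread, wait for it to
+// finish initializing, move it onto SCHED_EXT, then let it run its work
+// function and join it.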
+void TestOne() {
+  printf("\nStarting one worker test...\n");
+  auto t = CosThread(CosThread::KernelSchedulerType::kExt, [] {
+    printf("thread begin...\n");
+    sleep(1);
+    printf("sleep over.\n");
+    
+    std::thread t2(
+        [] { assert(sched_getscheduler(0) == SCHED_EXT); });
+    t2.join();
+  });
+
+  t.WaitUntilInitComplete();
+  sched_setscheduler(t.tid(), SCHED_EXT, &param);
+  t.NotifyWork();
+  t.Join();
+  printf("\nFinish one worker test.\n");
+}
+
+
+void TestMany(int num_workers) {
+  printf("\nStarting many worker test...\n");
+  std::vector<std::unique_ptr<CosThread>> workers;
+
+  for (int i = 0; i < num_workers; i++) {
+    workers.emplace_back(new CosThread(CosThread::KernelSchedulerType::kExt, [] {
+          printf("working...\n");
+          sleep(1);
+    }));
+  }
+
+  for (auto& t : workers) {
+    t->WaitUntilInitComplete();
+    sched_setscheduler(t->tid(), SCHED_EXT, &param);
+    t->NotifyWork();
+  }
+  for (auto& t : workers) t->Join();
+  printf("\nFinish many worker test.\n");
+}
+
+void TestSwitchToCfs() {
+  printf("\nStarting switch-to-cfs test...\n");
+  CosThread t = CosThread(CosThread::KernelSchedulerType::kExt, [] {
+    printf("thread begin...\n");
+    sleep(1);
+    printf("sleep over.\n");
+
+    printf("now switch to CFS...\n");
+    assert(sched_getscheduler(0) == SCHED_EXT);
+
+    assert(sched_setscheduler(0, SCHED_NORMAL, &param) == 0);
+    assert(sched_getscheduler(0) == SCHED_NORMAL);
+    printf("switch to CFS successfully!\n");
+  });
+
+  t.WaitUntilInitComplete();
+  sched_setscheduler(t.tid(), SCHED_EXT, &param);
+  t.NotifyWork();
+  t.Join();
+  printf("\nFinish switch-to-cfs test.\n");
+}
+
+int main(){
+    {
+        printf("***TestOne***\n");
+        Timer t = Timer();
+        TestOne();
+    }
+
+    {
+        printf("***TestMany***\n");
+        Timer t = Timer();
+        TestMany(100);
+    }
+
+    {
+        printf("***TestSwitchToCfs***\n");
+        Timer t = Timer();
+        TestSwitchToCfs();
+    }
+    return 0;
+}
\ No newline at end of file