diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dbe9c82b3610ccd58d1c681848dcd322e500051e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.vscode/
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000000000000000000000000000000000000..87634fa2cb12701860989a0ebeb17374dccd71e9
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,58 @@
+{
+    "files.associations": {
+        "scx_sjf_common.h": "c",
+        "array": "c",
+        "atomic": "c",
+        "bit": "c",
+        "*.tcc": "c",
+        "cctype": "c",
+        "chrono": "c",
+        "clocale": "c",
+        "cmath": "c",
+        "compare": "c",
+        "concepts": "c",
+        "cstdarg": "c",
+        "cstddef": "c",
+        "cstdint": "c",
+        "cstdio": "c",
+        "cstdlib": "c",
+        "cstring": "c",
+        "ctime": "c",
+        "cwchar": "c",
+        "cwctype": "c",
+        "deque": "c",
+        "map": "c",
+        "string": "c",
+        "unordered_map": "c",
+        "vector": "c",
+        "exception": "c",
+        "algorithm": "c",
+        "functional": "c",
+        "iterator": "c",
+        "memory": "c",
+        "memory_resource": "c",
+        "numeric": "c",
+        "random": "c",
+        "ratio": "c",
+        "string_view": "c",
+        "system_error": "c",
+        "tuple": "c",
+        "type_traits": "c",
+        "utility": "c",
+        "initializer_list": "c",
+        "iosfwd": "c",
+        "iostream": "c",
+        "istream": "c",
+        "limits": "c",
+        "new": "c",
+        "numbers": "c",
+        "ostream": "c",
+        "semaphore": "c",
+        "sstream": "c",
+        "stdexcept": "c",
+        "stop_token": "c",
+        "streambuf": "c",
+        "thread": "c",
+        "typeinfo": "c"
+    }
+}
\ No newline at end of file
diff --git a/README.md b/README.md
deleted file mode 100644
index 3bd3470e814b09d7d91453c637701fcc2fa6eb15..0000000000000000000000000000000000000000
--- a/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# proj134-CFS-based-userspace-scheduler
-
-## Description
-
-This is the project repository of the COS team from Harbin Institute of Technology (Shenzhen); the competition topic is [proj134-CFS-based-userspace-scheduler](https://github.com/oscomp/proj134-CFS-based-userspace-scheduler)
-
diff --git a/client/Makefile b/client/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..add1b1e87d0af5ebf7d773753105ffe07683034e
--- /dev/null
+++ b/client/Makefile
@@ -0,0 +1,23 @@
+CXX = g++
+INCLUDES := $(CURDIR)/../lib
+# c++14 for the lambda init-capture in lib/cos_thread.h; -pthread for std::thread
+CXXFLAGS = -std=c++14 -Wall -pthread -I$(INCLUDES)
+LDFLAGS =
+
+TARGET = simple_client
+SRCS = simple_client.cpp
+OBJS = $(SRCS:.cpp=.o)
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS)
+	$(CXX) $(CXXFLAGS) $(LDFLAGS) $^ -o $@
+
+cos_client: cos_client.o
+	$(CXX) $(CXXFLAGS) $(LDFLAGS) $^ -o $@
+
+%.o: %.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+clean:
+	rm -rf $(TARGET) cos_client cos_client.o $(OBJS) shm*
+
diff --git a/client/cos_client.cpp b/client/cos_client.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b5690cf8d6eead79088d884059ff1681ad1f8a4c
--- /dev/null
+++ b/client/cos_client.cpp
@@ -0,0 +1,63 @@
+#include <sched.h>
+#include <unistd.h> // getpid/gettid/sleep/ftruncate
+#include <vector>
+#include <memory>   // std::unique_ptr
+#include <stdio.h>
+#include <sys/mman.h>
+#include <fcntl.h>  // for O_RDWR and open
+#include <sstream>
+#include <cstring>
+// include order matters
+#include "cos_client.h"
+#include "cos.h"
+#include "cos_thread.h"
+
+#include "hash.h"
+
+#define SCHED_EXT 7
+
+struct option{
+    int worker_size;
+};
+
+// TODO: actually parse argc/argv; for now every run uses 100 workers.
+struct option get_options_from_args(int argc, char** argv){
+    return {100};
+}
+
+int main(int argc, char** argv){
+    struct option op = get_options_from_args(argc, argv);
+    printf("%d: cos client starting\n", getpid());
+
+    char buf[128];
+    snprintf(buf, sizeof(buf), "/etc/cos/shm/shm_%d", getpid());
+    int shm_fd = open(buf, O_RDWR | O_CREAT | O_TRUNC, 0644);
+    // Size the file before touching the mapping; accessing pages beyond
+    // the end of a zero-length file would raise SIGBUS.
+    ftruncate(shm_fd, SHM_SIZE);
+    void* shm = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE,
+                     MAP_SHARED, shm_fd, 0);
+    memset(shm, 0, SHM_SIZE);
+
+    LFHashTable<struct entry> hashtable(shm, SHM_SIZE, 0);
+    std::vector<std::unique_ptr<CosThread>> workers;
+    for(int i = 0; i < op.worker_size; i++){
+        workers.emplace_back(new CosThread(CosThread::KernelSchedulerType::kExt, []{
+            while(true){ // do some work
+                printf("%d: working... scheduler %d\n", gettid(), sched_getscheduler(gettid()));
+                sleep(1);
+            }
+        }));
+    }
+
+    for (auto& t : workers) {
+        t->WaitUntilInitComplete();
+        int tid = t->tid();
+        hashtable.Add(tid, {tid});
+
+        struct sched_param param = { .sched_priority = 0 };
+        if (sched_setscheduler(tid, SCHED_EXT, &param) != 0)
+            perror("sched_setscheduler");
+
+        printf("%d: waking up %d\n", getpid(), tid);
+        t->NotifyWork();
+    }
+
+    for (auto& t : workers) t->Join();
+
+    return 0;
+}
\ No newline at end of file
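Both clients bootstrap their shared memory the same way, and the order matters: the file must be sized with ftruncate() before the mapping is touched, since accessing pages beyond the end of a zero-length file raises SIGBUS. A minimal sketch of the pattern, assuming /etc/cos/shm already exists (`make install` under sched/ creates it):

```cpp
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstdio>

// Create and map this process's /etc/cos/shm/shm_<pid> block.
void* map_client_shm(size_t size) {
    char path[128];
    snprintf(path, sizeof(path), "/etc/cos/shm/shm_%d", getpid());

    int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (fd < 0 || ftruncate(fd, size) != 0)
        return nullptr;

    void* shm = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    return shm == MAP_FAILED ? nullptr : shm;
}
```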
diff --git a/client/simple_client.cpp b/client/simple_client.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d2f65bd844db96d1a7422e7bb89efc9cab01b1ba
--- /dev/null
+++ b/client/simple_client.cpp
@@ -0,0 +1,48 @@
+#include <sched.h>
+#include <unistd.h>
+#include <vector>
+#include <thread>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <fcntl.h> // for O_RDWR and open
+#include <sstream>
+#include <cstring>
+#include "hash.h"
+
+#define SCHED_EXT 7
+#define SHM_SIZE 4096
+
+
+int main(int argc, char** argv) {
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "/etc/cos/shm/shm_%d", getpid());
+    int shm_fd = open(buf, O_RDWR | O_CREAT | O_TRUNC, 0644);
+    ftruncate(shm_fd, SHM_SIZE);
+    int* shm = (int*)mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
+    memset(shm, 0, SHM_SIZE);
+
+    int num_workers = 10;
+    LFHashTable<int32_t> hashtable((void*)shm, SHM_SIZE, 0);
+    std::vector<std::thread> workers(num_workers);
+
+    for (int i = 0; i < num_workers; i++) {
+        workers[i] = std::thread([&hashtable] {
+
+            hashtable.Add(gettid(), gettid());
+
+            struct sched_param param = { .sched_priority = 0 };
+            if (sched_setscheduler(gettid(), SCHED_EXT, &param) != 0)
+                perror("sched_setscheduler");
+
+            printf("scheduling class: %d\n", sched_getscheduler(gettid()));
+
+            while (true) {
+                printf("%d working...\n", gettid());
+                // sleep(1);
+            }
+        });
+    }
+    for (int i = 0; i < num_workers; i++) {
+        workers[i].join();
+    }
+}
\ No newline at end of file
diff --git a/gnu/stubs.h b/gnu/stubs.h
new file mode 100644
index 0000000000000000000000000000000000000000..719225b1662697f90ab04d9a0f6562e9b4bc34d0
--- /dev/null
+++ b/gnu/stubs.h
@@ -0,0 +1 @@
+/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */
diff --git a/lib/cos.h b/lib/cos.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e334848dc8445b9bc0e841090f47d8fde693e0a
--- /dev/null
+++ b/lib/cos.h
@@ -0,0 +1,7 @@
+#pragma once
+
+struct entry;
+
+// Size of each shm block
+#define SHM_SIZE 4096
+
+// // Maximum number of entries per shm block
+// #define MAX_ENTRY_NUMS SHM_SIZE / sizeof(struct entry)
\ No newline at end of file
diff --git a/lib/cos_client.h b/lib/cos_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..88a129210eccb2284721de031f0e4b8647e2e2ec
--- /dev/null
+++ b/lib/cos_client.h
@@ -0,0 +1,6 @@
+#pragma once
+
+// Must be fully defined here, not just declared.
+// Kept in its own header so the agent can include it as well.
+// The per-thread record stored in shm; every managed thread owns one.
+struct entry{
+    int ddl; // deadline
+};
\ No newline at end of file
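Both clients move their workers out of CFS with a bare `sched_setscheduler()` call. A minimal sketch of that step with error handling — SCHED_EXT = 7 is this tree's UAPI value; on a kernel without sched_ext the call fails with EINVAL, and per the sched_ext notes elsewhere in this patch, ext tasks simply behave like SCHED_NORMAL until a BPF scheduler is attached:

```cpp
#include <sched.h>
#include <cstdio>

#define SCHED_EXT 7 // UAPI value used throughout this tree

// Move one thread (tid) into the sched_ext class. sched_ext takes no
// static priority, so sched_priority must be 0.
bool enter_sched_ext(int tid) {
    struct sched_param param = { .sched_priority = 0 };
    if (sched_setscheduler(tid, SCHED_EXT, &param) != 0) {
        perror("sched_setscheduler(SCHED_EXT)");
        return false;
    }
    return true;
}
```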
diff --git a/lib/cos_thread.h b/lib/cos_thread.h
new file mode 100644
index 0000000000000000000000000000000000000000..239a1671b8ce7cea0d653fc8ffc6baeed2432667
--- /dev/null
+++ b/lib/cos_thread.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <thread>
+#include <atomic>
+#include <functional> // for std::function
+#include <unistd.h>
+
+class CosThread {
+ public:
+  // The kernel scheduling class to run the thread in.
+  enum class KernelSchedulerType {
+    // Linux Completely Fair Scheduler.
+    kCfs,
+    // ext.
+    kExt,
+  };
+
+  explicit CosThread(KernelSchedulerType ksched, std::function<void()> work) {
+    work_start_ = false;
+    init_complete_ = false; // was previously left uninitialized
+    ksched_ = ksched;
+    thread_ = std::thread([this, w = std::move(work)] {
+      tid_ = gettid();
+      NotifyInitComplete();
+
+      // if (ksched_ == KernelSchedulerType::kExt) {
+      WaitUntilWork();
+      // }
+
+      std::move(w)();
+    });
+  }
+  CosThread(const CosThread&) = delete;
+  CosThread& operator=(const CosThread&) = delete;
+  ~CosThread() = default;
+
+  // Joins the thread.
+  void Join() {
+    thread_.join();
+  }
+
+  void WaitUntilWork(){
+    while(!work_start_){
+      sched_yield();
+    }
+  }
+
+  void NotifyWork(){
+    work_start_ = true;
+  }
+
+  void WaitUntilInitComplete(){
+    while(!init_complete_){
+      sched_yield();
+    }
+  }
+
+  void NotifyInitComplete(){
+    init_complete_ = true;
+  }
+
+  bool Joinable() const { return thread_.joinable(); }
+
+  // For external callers. Note: the value is only valid after
+  // WaitUntilInitComplete() has returned; this accessor itself does not block.
+  int tid() {
+    return tid_;
+  }
+
+ private:
+  // Flag telling the worker it may start running `work`; set to true by the
+  // owning thread via NotifyWork(). Atomic because it is written and
+  // busy-waited on from different threads.
+  std::atomic<bool> work_start_;
+
+  std::atomic<bool> init_complete_;
+
+  // The thread's TID (thread identifier).
+  int tid_;
+
+  // The kernel scheduling class the thread is running in.
+  KernelSchedulerType ksched_;
+
+  // The thread.
+  std::thread thread_;
+};
\ No newline at end of file
diff --git a/lib/hash.h b/lib/hash.h
new file mode 100644
index 0000000000000000000000000000000000000000..82b325497715df4b84770c5eb121f8e71ea67b36
--- /dev/null
+++ b/lib/hash.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#include <iostream>
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+
+template <typename THDINFO>
+struct Entry {
+    int32_t key;
+    THDINFO value;
+};
+
+// Hash table used for communication between the client and the agent.
+// The key is a tid; the value is the THDINFO scheduling information to be
+// handed over for that thread.
+template <typename THDINFO>
+class LFHashTable {
+private:
+    // Shared memory used for inter-process communication.
+    void* shd_mem_;
+    // Number of key/value pairs the table can hold. Entries are never
+    // removed, so this must stay above the number of threads ever added,
+    // or the probe loops below will not terminate.
+    size_t capacity_;
+    // The shared memory viewed as an entry array (same region as shd_mem_).
+    Entry<THDINFO>* table_;
+    // Key information, e.g. the main thread's id; the hash function uses it
+    // to spread the child tids.
+    int32_t key_info_;
+
+public:
+    LFHashTable(){}
+
+    LFHashTable(void* shd_mem, size_t mem_size, int32_t key_info): shd_mem_(shd_mem), key_info_(key_info) {
+        capacity_ = mem_size / sizeof(Entry<THDINFO>);
+        table_ = reinterpret_cast<Entry<THDINFO>*>(shd_mem);
+    }
+
+    bool Add(int32_t key, THDINFO value) {
+        assert(key != 0); // key 0 marks an empty slot
+        for (size_t idx = hash(key); ; idx = (idx + 1) % capacity_) {
+            if (table_[idx].key == 0) {
+                if (!__sync_bool_compare_and_swap(&table_[idx].key, 0, key)) {
+                    continue;
+                }
+            }
+            if (table_[idx].key != key) {
+                continue;
+            }
+            table_[idx].value = value;
+            return true;
+        }
+    }
+
+    THDINFO Get(int32_t key) {
+        assert(key != 0);
+        for (size_t idx = hash(key); ; idx = (idx + 1) % capacity_) {
+            if (table_[idx].key == 0) {
+                return {};
+            }
+            if (table_[idx].key != key) {
+                continue;
+            }
+            return table_[idx].value;
+        }
+    }
+
+private:
+    size_t hash(int32_t key) {
+        // Keep the index non-negative even when key < key_info_; the old
+        // plain modulo could return a negative index.
+        int64_t d = (int64_t)key - (int64_t)key_info_;
+        return (size_t)(((d % (int64_t)capacity_) + (int64_t)capacity_) % (int64_t)capacity_);
+    }
+
+};
+
+
+
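A sketch of the intended cross-process use of LFHashTable (illustrative, not part of the patch): the client and the agent map the same shm file and construct the table over it with the same key_info (both clients currently pass 0), so the two sides probe identical slots. Keys are tids and must be non-zero, because key 0 marks an empty slot for the compare-and-swap in Add().

```cpp
#include "hash.h"

// Writer side (client): publish a worker's record under its tid.
void publish(void* shm, size_t size, int32_t tid) {
    LFHashTable<int32_t> table(shm, size, /*key_info=*/0);
    table.Add(tid, tid);
}

// Reader side (agent): look the record back up; Get() returns a
// value-initialized THDINFO (0 here) when the tid is absent.
int32_t lookup(void* shm, size_t size, int32_t tid) {
    LFHashTable<int32_t> table(shm, size, /*key_info=*/0);
    return table.Get(tid);
}
```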
çš„å¯èƒ½æ€§ï¼Œæˆ–通过将wakeup_granularity_ns设置为大于latency_nsçš„ä¸€åŠæ¥ç¦ç”¨å®ƒã€‚其他设置对性能并没有太大影å“。 - -æ¢å¥è¯è¯´ï¼Œå¯¹äºŽä¸€äº›å·¥ä½œè´Ÿè½½ï¼Œé•¿æ—¶é—´è¿è¡Œçš„任务被处ç†çŸæœŸè¯·æ±‚的任务抢å å¯ä»¥æé«˜æ€§èƒ½ï¼Œè€Œå¯¹äºŽä¸€äº›åªè¿è¡ŒçŸæœŸè¯·æ±‚的工作负载,ä¸è¢«æŠ¢å å而更有利。 - -这引å‘了一些观察和想法: --ä¸åŒçš„工作负载需è¦ä¸åŒçš„ç–略。能够针对æ¯ä¸ªå·¥ä½œè´Ÿè½½è¿›è¡Œé…ç½®å¯èƒ½å¾ˆæœ‰ç”¨ã€‚ --从ä¸è¢«æŠ¢å ä¸èŽ·ç›Šçš„å·¥ä½œè´Ÿè½½ä»ç„¶å¯èƒ½ä»ŽæŠ¢å (低优先级)åŽå°ç³»ç»Ÿä»»åŠ¡ä¸èŽ·ç›Šã€‚ --在生产ä¸å¿«é€Ÿï¼ˆä¸”安全地)å°è¯•ä¸åŒçš„ç–ç•¥ï¼Œè€Œæ— éœ€å…³é—åº”ç”¨ç¨‹åºæˆ–釿–°å¯åŠ¨ç³»ç»Ÿï¼Œä»¥ç¡®å®šä¸åŒå·¥ä½œè´Ÿè½½çš„ç–略,将会很有用。 --åªæœ‰å¾ˆå°‘çš„å·¥ä½œè´Ÿè½½è¶³å¤Ÿå¤§ä¸”æ•æ„Ÿï¼Œéœ€è¦å…¶è‡ªå·±çš„ç–略调整。对于其他所有情况,CFS本身应该足够好,我们å¯èƒ½ä¸æƒ³å°†ç–略调整替æ¢CFS所åšçš„任何事情。 - -这引出了BPFé’©å。在å„ç§å†…æ ¸å系统ä¸ï¼ŒBPFé’©å已被æˆåŠŸç”¨äºŽæä¾›ä¸€ç§å¤–部代ç å®‰å…¨åœ°æ›´æ”¹ä¸€äº›å†…æ ¸å†³ç–的方å¼ã€‚BPF工具使这å˜å¾—相当容易,部署BPF脚本的人已ç»ä¹ æƒ¯äºŽä¸ºæ–°å†…æ ¸ç‰ˆæœ¬æ›´æ–°å®ƒä»¬ã€‚ - -æ¤è¡¥ä¸é›†æ—¨åœ¨å¼€å§‹è®¨è®ºBPF在调度程åºä¸çš„æ½œåœ¨åº”用。它还旨在ç€é™†ä¸€äº›éžå¸¸åŸºæœ¬çš„BPFåŸºç¡€æž¶æž„ï¼Œä»¥æ·»åŠ æ–°çš„BPFé’©å到调度程åºä¸ï¼Œä¸€ç»„最å°çš„æœ‰ç”¨çš„辅助程åºï¼Œç›¸åº”çš„libbpf更改ç‰ç‰ã€‚ - -我们在CFSä¸ä½¿ç”¨BPF的第一次实验看起æ¥éžå¸¸æœ‰å‰é€”。我们处于éžå¸¸æ—©æœŸçš„阶段,但是我们已ç»çœ‹åˆ°äº†æˆ‘们(Facebook的)主è¦Web工作负载的良好延迟和约1ï¼…çš„RPSæå‡ã€‚ - -æ®æˆ‘æ‰€çŸ¥ï¼Œè°·æŒæ£åœ¨è¿›è¡Œä¸€ç§æ›´æ¿€è¿›çš„æ–¹æ³•[2]:他们打算将调度代ç 移动到用户空间。看起æ¥ä»–ä»¬çš„æ ¸å¿ƒåŠ¨æœºæœ‰äº›ç±»ä¼¼ï¼šä½¿è°ƒåº¦å™¨æ›´æ˜“äºŽå¼€å‘ã€éªŒè¯å’Œéƒ¨ç½²ã€‚尽管他们的方法ä¸åŒï¼Œä½†ä»–们也使用BPFæ¥åŠ é€Ÿä¸€äº›çƒç‚¹è·¯å¾„。我认为建议的基础设施也å¯ä»¥ä¸ºä»–们的目的æœåŠ¡ã€‚ - -一个用户空间部分的例åï¼Œå®ƒåŠ è½½äº†ä¸€äº›ç®€å•的挂钩,在这里[3]æä¾›ä»…仅是为了简化使用æä¾›çš„å†…æ ¸è¡¥ä¸çš„æ“ä½œã€‚ diff --git a/record/dzh/ghost_lib.md b/record/dzh/ghost_lib.md deleted file mode 100644 index b541ad140473d7a5b18ffb50a59c53751cbc06a4..0000000000000000000000000000000000000000 --- a/record/dzh/ghost_lib.md +++ /dev/null @@ -1,435 +0,0 @@ -## å‰è¨€ - -enclave, scheduler, agent, channel, message,statusword, statuswordtable, runrequest, task,ghost,cpuã€‚è¿™æœ€æ ¸å¿ƒçš„æ¦‚å¿µåœ¨ghost用户æ€è°ƒåº¦æ¡†æž¶é‡Œçš„关系是? 
- - - -## Agent - -#### Agent - -explain:论文ä¸çš„agent,和cpu一一对应,属于enclave,和schduler一一对应,也就是说æ¯ä¸€ä¸ªagent都有自己的调度算法 - -``` -StartBegin():agent线程开始执行threadbody,对于localagent是将当å‰agentè¿ç§»åˆ°å…¶ç®¡ç†çš„cpu上 -StartComplete():ç‰å¾…enclave ready -TerminateBegin():通知æ¡ä»¶å˜é‡ -TerminateComplete():摧æ¯çº¿ç¨‹èµ„æº -ThreadBody():agent线程所执行的函数 -AgentThread():在agent相关准备就绪åŽï¼Œç”±ThreadBody调度。由具体的调度算法的agent去实现 -Ping():让agent线程回到它所管ç†çš„cpu上执行 -SignalReady() :在agentåˆå§‹åŒ–结æŸåŽå”¤é†’start_complete(),让其å¯ä»¥æŽ¥ä¸‹æ¥è°ƒç”¨enclaveçš„ready方法 -WaitForEnclaveReady() :ç‰å¾…所在enclaveçš„ready -AgentScheduler():返回agent的调度类,返回空值,被其他继承的调度类é‡å†™ï¼Œå…¶ä»–继承的调度算法会有自己的Scheduler调度类 - -Enclave* enclave_:agent所属enclave -Gtid gtid_:agentçš„ghostçº¿ç¨‹å· -Cpu cpu_:agent所管ç†çš„cpu -Notification ready_, finished_, enclave_ready_, do_exit_ï¼šç›¸å…³ç”¨äºŽåŒæ¥çš„æ¡ä»¶å˜é‡ -std::thread thread_:è¿è¡Œagent的线程 -``` - -#### LocalAgent - -explain:继承agent,供其他调度算法继承,如FifoAgent。é‡å†™äº†ThreadBody(就是上é¢çš„),多了statuswordå—æ®µ - -``` -LocalStatusWord status_word_:通过æ¤å†…æ ¸å…±äº«å†…å˜èŽ·å–相关信æ¯ï¼Œå¦‚cpu空闲,Aseqç‰ -``` - -#### FullAgent - -explain:å°†å•个enclave下的agent,task,scheduler汇集起æ¥ï¼Œåˆ«çš„调度算法会新建一个个性化类去继承这个类,如FullFifoAgent,和enclave一一对应 - -``` -StartAgentTasks():创建当å‰enclave下的所有cpuçš„agent,和它们对应的cpu绑定起æ¥ï¼Œå¹¶ä¸”一次调用它们的StartBegin方法è¿ç§»åˆ°å¯¹åº”cpu上è¿è¡Œ -TerminateAgentTasks()ï¼šè¢«æ´¾ç”Ÿç±»çš„æžæž„方法调用 - -LocalEnclave enclave_:一一对应的enclave -std::vector<std::unique_ptr<Agent>> agents_ï¼šå…¶ä¸æ‰€åŒ…å«çš„agent -``` - - -#### FullFifoAgent(p) - -explain:继承FullAgent - -``` -FullFifoAgent(AgentConfig config):整个调度算法åˆå§‹åŒ–的开始 - -std::unique_ptr<FifoScheduler> scheduler_:按é“ç†æ¥è¯´ï¼Œæ¯ä¸ªagent有一个调度类,也就是说这里应该是一个调度类的listï¼Œä½†æ˜¯è¿™é‡Œåªæœ‰ä¸€ä¸ªï¼Œæˆ‘è§‰å¾—åº”è¯¥æ˜¯å› ä¸ºè°ƒåº¦ç®—æ³•æ˜¯FIFO,所以所有agent应该统一æˆä¸€ä¸ªè°ƒåº¦ç±» -``` - -#### FullFifoAgent(c) - -explain:继承FullAgent - -``` -FullFifoAgent(FifoConfig config):整个调度算法åˆå§‹åŒ–的开始 - -std::unique_ptr<FifoScheduler> scheduler_:按é“ç†æ¥è¯´ï¼Œæ¯ä¸ªagent有一个调度类,也就是说这里应该是一个调度类的listï¼Œä½†æ˜¯è¿™é‡Œåªæœ‰ä¸€ä¸ªï¼Œæˆ‘è§‰å¾—åº”è¯¥æ˜¯å› ä¸ºè°ƒåº¦ç®—æ³•æ˜¯FIFO,所以所有agent应该统一æˆä¸€ä¸ªè°ƒåº¦ç±» -``` - -#### AgentProcess -explain:一个地å€ç©ºé—´ä¸€æ¨¡ä¸€æ ·çš„父å进程,负责è¿è¡ŒFullAgent - -``` -AgentProcess(AgentConfig config)ï¼šæž„é€ å‡ºçˆ¶å两进程,父进程负责幕åŽï¼Œå主线程负责创建agent线程并且ç‰å¾…退出,configå°†ä¼ ç»™FullAgentçš„æž„é€ æ–¹æ³• - -std::unique_ptr<ForkedProcess> agent_proc_:fork系统调用的å°è£… -std::unique_ptr<FullAgent> full_agent_:被è¿è¡Œçš„FullAgent -std::unique_ptr<SharedBlob> sb_:父å进程共享æ¤å†…å˜è¿›è¡Œé€šä¿¡ -``` - - -## Task - -#### Task - -explain:代表一次è¦è¢«è°ƒåº¦ä¸Šcpu的任务,被调度算法继承,如FifoTask - -``` -Gtid gtid:被调度task所代表线程的gtid -LocalStatusWord status_wordï¼šå’Œå†…æ ¸å…±äº«çš„seqç‰ä¿¡æ¯ -Seqnum seqnum:seq -``` - -#### TaskAllocator - -explain:å˜å‚¨task - -#### SingleThreadMallocTaskAllocator - -explain:继承TaskAllocator - -``` - -``` - - - - - -## Sheduler - -#### Sheduler - -explain:åšè°ƒåº¦å†³ç–的类,和agent应该是M对N? 
- -``` -Scheduler(Enclave* enclave, CpuList cpus):将当å‰è°ƒåº¦ç±»åŠ å…¥enclaveä¸ -EnclaveReady():TODO -DiscoverTasks():TODO -GetDefaultChannel() -GetAgentChannel(const Cpu& cpu) - -Enclave* const enclave_ -CpuList cpus_; -``` - -#### BasicDispatchScheduler - -explain:继承Sheduler,一ç§è°ƒåº¦å™¨å®žçŽ°ï¼Œèƒ½å¤Ÿè§£ç 原始消æ¯ï¼ˆæ¥è‡ªchannel),将它们与任务派生类型相关è”,并调度到适当的调度类方法。其他的调度算法会继承这个类,如FifoScheduler - -``` -BasicDispatchScheduler(Enclave* enclave, CpuList cpus, std::shared_ptr<TaskAllocator<TaskType>> allocator) -void DispatchMessage(const Message& msg) ï¼šå°†æ¶ˆæ¯æ ¹æ®ç±»åž‹è¿›è¡Œç›¸åº”å¤„ç† - -ç›¸åº”å¤„ç†æ–¹æ³•:交给对应调度类去实现 -CpuTick(const Message& msg) -CpuNotIdle(const Message& msg) -CpuTimerExpired(const Message& msg) -CpuAvailable(const Message& msg) -CpuBusy(const Message& msg) -AgentBlocked(const Message& msg) -AgentWakeup(const Message& msg) - -std::shared_ptr<TaskAllocator<TaskType>> const allocator_ -``` - - - - - - - - - - -## Enclave - -#### Enclave - -explain:论文ä¸çš„enclave,上é¢åŒ…å«è¿è¡Œçš„agentå’Œscheduler,cpu拓扑 - -``` -Enclave(const AgentConfig config):通过agentconfigåŽ»æž„é€ enclave -GetRunRequest(const Cpu& cpu)èŽ·å–æŒ‡å®šcpu上的runrequest -CommitRunRequest(RunRequest* req):commitæ¤runrequest,底层调用ghostçš„æŽ¥å£ -SubmitRunRequest(RunRequest* req):submitæ¤runrequest,底层调用ghostçš„æŽ¥å£ -CompleteRunRequest(RunRequest* req):completeæ¤runrequest,底层调用ghost的接å£ï¼Œå’Œä¸Šé¢é‚£ä¸ªæŽ¥å£é…åˆä½¿ç”¨ä¼°è®¡ -LocalYieldRunRequest(const RunRequest* req, BarrierToken agent_barrier, int flags):agent结æŸåœ¨å½“å‰cpu上的调度 -Ready():必须在当å‰enclave上的所有agent和所有schedulerè¢«æž„é€ åŽæ‰èƒ½è°ƒç”¨ -WaitForOldAgent():如果有一个è€agent还在æ¤enclave上,ç‰å¾…直到它退出 -AttachAgent(const Cpu& cpu, Agent* agent) -void DetachAgent(Agent* agent) -AttachScheduler(Scheduler* scheduler) -DetachScheduler(Scheduler* scheduler) - -const AgentConfig config_:代表本enclaveç›¸å…³å‚æ•° -Topology* topology_:机器的cpu拓扑 -CpuList enclave_cpus_:本enclave包å«çš„cpuï¼ï¼ï¼ -std::list<Scheduler*> schedulers_:在enclave上è¿è¡Œçš„schedulers -std::list<Agent*> agents_:在enclave上è¿è¡Œçš„agent -``` - -#### LocalEnclave - -explain:继承enclave,ä¸èƒ½å†è¢«ç»§æ‰¿ - -``` -MakeChannel(int elems, int node, const CpuList& cpulist) -struct CpuRep { - Agent* agent; - LocalRunRequest req; -} -CpuRep cpus_[MAX_CPUS]:cpu与其一一对应的agent,runrequest -ghost_cpu_data* data_region_ï¼šå†…æ ¸å…±äº«é€šä¿¡åŒºåŸŸ -size_t data_region_size_ -int dir_fd_ = -1:enclave相当于目录 -int ctl_fd_ = -1:控制enclaveçš„fd -``` - -## RunRequest - - -#### RunRequestOptions -explain: commit一个txnçš„å‚æ•° - -``` -Gtid target = Gtid(0) //the task to run next -BarrierToken target_barrier //Tseq -BarrierToken agent_barrier = StatusWord::NullBarrierToken() // Aseq -int commit_flags = 0 // controls how a transaction is committed -int run_flags = 0 // control a variety of side-effects when the task either gets oncpu or offcpu -``` - - -#### RunRequest - -explain:代表一次commit的请求容器,容器ä¸è£…的是task,和一个cpu一一对应,底层将调用GhostHelper()->Run()æäº¤æœ¬æ¬¡è¯·æ±‚ - -``` -Init(Enclave* enclave, const Cpu& cpu):åˆå§‹åŒ–runrequest,其所在的enclave和一一对应的cpu -Open(const RunRequestOptions& options):开å¯ä¸€ä¸ªå³å°†è¦æäº¤çš„äº‹åŠ¡ï¼Œç›¸å…³å‚æ•°ä½äºŽoptions -void OpenUnschedule() -void LocalYield(const BarrierToken agent_barrier, const int flags) :Agent must call LocalYield when it has nothing to do -bool Ping() :懂得都懂 -bool Commit() :对ghost commitçš„å°è£… -bool Submit() :对ghost submitçš„å°è£… - - -Enclave* enclave_ -Cpu cpu_; -``` - -#### LocalRunRequest - -explain:继承runrequest - -``` -Init(Enclave* enclave, const Cpu& cpu, ghost_txn* txn):åˆå§‹åŒ–,但是多了代表事务相关信æ¯çš„txn - -ghost_txn* 
txn_:代表事务相关信æ¯çš„txn -``` - - - - - - - - - -## Cpu - -#### Cpu - -explain:一个cpu的相关信æ¯ï¼Œå¦‚L3Cache,NUMAç‰ - -``` -struct CpuRep { - int cpu; - int core; - int smt_idx; - std::unique_ptr<CpuList> siblings; - std::unique_ptr<CpuList> l3_siblings; - int numa_node; -} -const CpuRep* rep_:cpuç›¸å…³ä¿¡æ¯ -``` - -#### CpuMap -explain:TODO一个代表指导cpu是å¦è¢«è®¾ç½®çš„ä½å›¾ - -#### CpuList -explain:继承CpuMap - - -#### Topology.h -explain:代表机器的cpu拓扑信æ¯ï¼ˆæˆ‘è§‰å¾—åº”è¯¥æ˜¯ä»£è¡¨æ•´ä¸ªæœºå™¨çš„ï¼Œè€Œä¸æ˜¯ä¸€ä¸ªenclave的,也就是说整个机器的cpuä¿¡æ¯éƒ½åœ¨è¿™é‡Œé¢ï¼‰ - -``` -const uint32_t num_cpus_:cpu个数 -CpuList all_cpus_:cpuä½å›¾ -std::vector<Cpu::CpuRep> cpus_:所有cpuä¿¡æ¯ -int highest_node_idx_:numa节点个数 -std::vector<CpuList> cpus_on_node_:å„个numa节点的cpu -``` - - - - - - - - -## Message - -#### Message - -explain:消æ¯é˜Ÿåˆ—ä¸çš„æ¶ˆæ¯ï¼Œè¢«å˜æ”¾åœ¨æ¶ˆæ¯é˜Ÿåˆ—ä¸ç‰å¾…agent或者kernel去消费 - -``` -struct ghost_msg { - uint16_t type; /* message type */ - uint16_t length; /* length of this message including payload */ - uint32_t seqnum; /* sequence number for this msg source */ - uint32_t payload[0]; /* variable length payload */ -}; - -ghost_msg* msg_ -``` - - -## Channel - -#### Channel - -explain:消æ¯é˜Ÿåˆ—ï¼Œå˜æ”¾æ¶ˆæ¯ï¼ŒåŸºäºŽå…±äº«å†…å˜ï¼Œå’ŒCpu关系 TODO - -``` -Peek():获å–队首 -Consume(const Message& msg):弹出队首 -max_elements()ï¼šçŽ¯å½¢é˜Ÿåˆ—å¤§å° -AssociateTask(Gtid gtid, int barrier, int* status)TODO底层调用ghostçš„api -SetEnclaveDefault():将当å‰channe设置为enclave的默认channelTODO -``` - -#### LocalChannel - -explain:继承Channel - -``` -LocalChannel(int elems, int node, CpuList cpulist):底层调用ghostçš„api -struct ghost_queue_header { - uint32_t version; /* ABI version */ - uint32_t start; /* offset from the header to start of ring */ - uint32_t nelems; /* power-of-2 size of ghost_ring.msgs[] */ -} -int fd_:消æ¯é˜Ÿåˆ—çš„fd -ghost_queue_header* header_:队头 -``` - - -## StatusWord - -#### StatusWord - -explainï¼šå’Œå†…æ ¸é€šä¿¡çš„å…±äº«å†…å˜ï¼Œå˜å‚¨Tseq,Aseqç‰ä¿¡æ¯ - -``` -typedef uint32_t BarrierToken - -struct ghost_sw_info { - uint32_t id; /* status_word region id */ - uint32_t index; /* index into the status_word array */ -}; - -struct ghost_status_word { - uint32_t barrier; - uint32_t flags; - uint64_t gtid; - int64_t switch_time; /* time at which task was context-switched onto CPU */ - uint64_t runtime; /* total time spent on the CPU in nsecs */ -} - -ghost_sw_info sw_info_:swçš„idå’Œindex -ghost_status_word* sw_:swä¿¡æ¯ -``` - -#### LocalStatusWord - -explain:上é¢çš„继承 - -#### StatusWordTable - -explain:å˜å‚¨statusword的一å—内å˜åŒºåŸŸ(ps:这里我觉得有必è¦å°†statuswordtableå’Œchannelæ¥ä¸€ä¸ªå¯¹æ¯”ï¼Œå®ƒä»¬éƒ½æ˜¯å’Œå†…æ ¸å…±äº«æ•°æ®ç»“æž„) - -``` -size_t map_size_ = 0; -ghost_sw_region_header* header_ = nullptr; -ghost_status_word* table_ = nullptr; -``` - -#### LocalStatusWordTable - - - - - - - -## Ghost - -#### Ghost - -explain:ghostå†…æ ¸ç›¸å…³æŽ¥å£çš„å°è£… - -``` -原系统调用: -Run(const Gtid& gtid, BarrierToken agent_barrier, BarrierToken task_barrier, const Cpu& cpu, int flags):LocalYieldRunRequestå’ŒPingRunRequest,这两者是干啥呢 -SyncCommit(cpu_set_t& cpuset):SubmitSyncRequests -Commit(cpu_set_t& cpuset):SubmitRunRequests -CreateQueue(int elems, int node, int flags, uint64_t& mapsize):LocalChannelçš„æž„é€ æ–¹æ³• -ConfigQueueWakeup(int queue_fd, const CpuList& cpulist, int flags):LocalChannelçš„æž„é€ æ–¹æ³• -AssociateQueue(int queue_fd, ghost_type type, uint64_t arg, BarrierToken barrier, int flags):LocalChannelçš„AssociateTask -SetDefaultQueue(int queue_fd):SetEnclaveDefault -GetStatusWordInfo(ghost_type type, uint64_t arg, ghost_sw_info& 
info):LocalStatusWord(StatusWord::AgentSW) - -SchedGetAffinity(const Gtid& gtid, CpuList& cpulist):cfs -SchedSetAffinity(const Gtid& gtid, const CpuList& cpulist):rocksdb? -SchedTaskEnterGhost(int64_t pid, int dir_fd) -SchedAgentEnterGhost(int ctl_fd, const Cpu& cpu, int queue_fd) :Makes calling thread the ghost agent on `cpu`. -``` - -#### GhostSignals - -explain:ghost线程相关信å·å¤„ç†ï¼ˆä¸æ€Žä¹ˆæ¶‰åŠï¼Ÿå…ˆä¸ç®¡ï¼‰ - -#### GhostThread - -explain:原生线程的å°è£…,å¯ä»¥å†³å®šè¢«cfs还是ghost调度。和enclave的关系犹如目录与文件 - -``` -int tid_; -Gtid gtid_; -KernelScheduler ksched_:ghost还是cfs -Notification started_:线程开始è¿è¡Œï¼Œåˆ™è¿™ä¸ªå°†è¢«å”¤é†’ -std::thread thread_:线程 -``` - - - - - diff --git a/record/dzh/ghost_paper_record.md b/record/dzh/ghost_paper_record.md deleted file mode 100644 index 3ec71ad706672f9c980825f628c38b411d38ae17..0000000000000000000000000000000000000000 --- a/record/dzh/ghost_paper_record.md +++ /dev/null @@ -1,358 +0,0 @@ -# ghost论文阅读笔记 - - -## æ¦‚è¦ - -+ çŽ°å¦‚ä»Šäº‰å¯¹ä½¿ç”¨åœºæ™¯å¯¹å†…æ ¸è°ƒåº¦ç–略进行修改,在性能方é¢å¯ä»¥å¾—到很大æå‡ - -+ 但是为å•一使用场景定制特定调度ç–ç•¥çš„å†…æ ¸æ˜¯ä¸åˆ‡å®žé™…的,而且还涉åŠåˆ°é‡å¯å†…æ ¸ï¼Œè¿™ä¼šå¯¼è‡´æ€§èƒ½ï¼Œå¯ç”¨æ€§å¤§å¤§é™ä½Ž - -+ ghostæ˜¯ä¸€ä¸ªè¿™æ ·çš„ç”¨æˆ·æ€è°ƒåº¦æ¡†æž¶ï¼Œé€šè¿‡ç”¨æˆ·æ€agentå’Œå†…æ ¸é€šä¿¡ï¼Œèƒ½å¤Ÿå®žæ—¶çµæ´»å®šåˆ¶ç”¨æˆ·æƒ³è¦çš„夿‚调度ç–略而ä¸éœ€è¦é‡å¯å†…æ ¸ï¼Œå¹¶ä¸”é€‚åº”æ€§å¹¿æ³›ï¼Œæ— è®ºæ˜¯percpu还是centralized - -+ 使用ghost能够增大åžåé‡ï¼Œå‡å°‘å»¶è¿Ÿï¼ŒåŒæ—¶ä¸ºæ•°æ®ä¸å¿ƒå·¥ä½œè´Ÿè½½å¯ç”¨ç–略优化ã€éžä¸æ–å‡çº§å’Œæ•…障隔离。 - - -## 1.ä»‹ç» - -+ 许多特定场景的调度ç–略: - - - Shinjuku request scheduler - - Tableau scheduler - - Caladan scheduler - -+ 在大型工程ä¸éƒ¨ç½²ç‰¹å®šè°ƒåº¦ç–略难度æžå¤§ï¼Œå¾ˆå¯èƒ½é€ æˆå†…æ ¸å´©æºƒï¼Œå°±ç®—éƒ¨ç½²æˆåŠŸï¼Œå¯¹å†…æ ¸å‡çº§ä¹Ÿéœ€è¦åœæœº - -+ 以å‰çš„用户æ€è°ƒåº¦æ¡†æž¶è®¾è®¡æœ‰æ˜Žæ˜¾ç¼ºç‚¹ï¼šå¯¹åº”用部署需è¦ä¿®æ”¹ï¼›éœ€è¦ä¸“é—¨çš„èµ„æºæ—¶æœŸå¯ä»¥é«˜å“应;需è¦é’ˆå¯¹åº”用æ¥ç‰¹å®šä¿®æ”¹å†…æ ¸ - -+ 硬件环境å˜åŒ– - -+ The goal of ghOSt is to fundamentallychange how scheduling policies are designed, implemented, and deployed. 
ghOSt provides the agility of userspace development and ease of deployment, while still enabling ðœ‡s-scale scheduling - -+ agent是一个osè¿›ç¨‹ï¼Œé€šè¿‡ç›¸å…³ç³»ç»Ÿè°ƒç”¨ä¸Žå†…æ ¸é€šä¿¡ - -+ å†…æ ¸é€šè¿‡å¼‚æ¥æ¶ˆæ¯é˜Ÿåˆ—告诉agent它管ç†çš„线程状æ€å˜åŒ– - -+ agenté€šè¿‡å†…æ ¸ä¼ é€’çš„æ¶ˆæ¯åŒæ¥åœ°å‘Šè¯‰å†…æ ¸è°ƒåº¦ç–ç•¥çš„è½¬å˜ - -+ ghost支æŒå¹¶å‘执行多个调度ç–ç•¥ - -+ ghost相关通信和调度时长很å¯è§‚ - - -### 2.èƒŒæ™¯ä¸Žè®¾è®¡ç›®æ ‡ - -##### 背景 - -+ linuxç›®å‰é‡‡ç”¨cfs调度ç–略,很难针对特定场景进行针对性优化 - -+ å®žçŽ°å†…æ ¸è°ƒåº¦ç–略很难 - -+ éƒ¨ç½²å†…æ ¸è°ƒåº¦ç–略更难 - -+ 用户æ€çº¿ç¨‹çš„调度ç–略是ä¸å¤Ÿçš„ï¼Œå½’æ ¹ç»“åº•å®ƒè¿˜æ˜¯å—åˆ¶äºŽå†…æ ¸è°ƒåº¦ç–ç•¥ - -+ ä¸ºç‰¹å®šåœºæ™¯å®šåˆ¶å†…æ ¸ä¹Ÿæ˜¯ä¸åˆ‡å®žé™… - -+ 通过ebpf去定制调度ç–ç•¥ï¼Ÿä¹Ÿä¸æ˜¯å¾ˆé€‚åˆ - - ebfpå—到诸多é™åˆ¶ï¼Œå¦‚æ ˆå¤§å°ï¼Œå¾ªçŽ¯æ¬¡æ•°ï¼Œè®¿é—®å†…æ ¸æ•°æ®å—é™ - - ebpfæ˜¯åŒæ¥çš„,在调度å‰éœ€è¦é˜»å¡ž - -##### è®¾è®¡ç›®æ ‡ - -+ 容易实现和测试 - -+ 效率高,易表达 - -+ ä¸å±€é™äºŽper-CPU模型 - -+ 支æŒå¤šç§å¹¶å‘ç–ç•¥ - -+ éžä¸æ–更新(ä¸éœ€è¦é‡å¯ï¼‰å’Œé”™è¯¯éš”离 - - -### 3.设计与实现 - -##### 基本ç†å¿µ - -+ ghost概述 - - - 用户æ€agenté€šçŸ¥å†…æ ¸å¦‚ä½•è¿›è¡Œè°ƒåº¦ - - - å†…æ ¸å®žçŽ°é€šè¿‡ç”¨æˆ·æ€ä¿¡æ¯å®žçŽ°ä¸€ä¸ªç±»ä¼¼äºŽcfs的调度类 sheduling class - - - 调度类æä¾›ç”¨æˆ·æ€ä¸€ç»„接å£è®©ç”¨æˆ·æ€åŽ»å®šåˆ¶è°ƒåº¦ç–ç•¥ - - - 为了帮助用户æ€åˆ¤æ–ï¼Œå†…æ ¸å°†ç®¡ç†çº¿ç¨‹çš„状æ€é€šè¿‡æ¶ˆæ¯å’Œçжæ€ç ä¼ é€’ç»™agent - - - 而agent通过系统调用syscall和事务transactioné€šçŸ¥å†…æ ¸è°ƒåº¦ç–ç•¥ - -+ percpuå’Œcentralized概念 - - - percpu:调度åªç®¡æœ¬cpu的调度,有stealç–ç•¥ - - - centralized:全局调度 - -+ cpu与线程的概念 - - - çº¿ç¨‹ï¼šå†…æ ¸çº¿ç¨‹ - - - cpu:执行å•å…ƒ - -+ enclaves - - - 支æŒåœ¨å•机上执行多ç§è°ƒåº¦ç–ç•¥ - - - å› åœ°åˆ¶å®œåˆ†é…cpu(如NUMA架构) - -+ ghost使用agent - - - 用户æ€agentå®žçŽ°æ–¹ä¾¿ï¼Œè°ƒè¯•ç®€å• - - - é…置调度ç–ç•¥æ— éœ€é‡å¯ç³»ç»Ÿ - - - 对于percpu,都有一个agent对应,å¯ä»¥å¯¹æ¯ä¸ªcpué…ç½®ä¸åŒè°ƒåº¦ç–ç•¥ - - - 对于centralized,全局agent对所有cpuè°ƒåº¦è´Ÿè´£ï¼ŒåŒæ—¶è¿˜æœ‰å…¶ä»–䏿´»åŠ¨çš„agent - - - 所有agenté€šè¿‡å†…æ ¸çº¿ç¨‹çš„æ¨¡å¼å®žçŽ°ï¼Œä»–ä»¬åŒå±žäºŽä¸€ä¸ªè¿›ç¨‹ - - -##### å†…æ ¸åˆ°ä»£ç†çš„通信 - -+ 将所线程状æ€ä¼ 递给agent - - - 共享内å˜ï¼Ÿ - - - ç³»ç»Ÿå†…å˜æ–‡ä»¶/proc/pid? 
- - - API(消æ¯é˜Ÿåˆ—)yes - -+ messageæ¶ˆæ¯ - - - THREAD_CREATED - - THREAD_BLOCKED - - THREAD_PREEMPTED - - THREAD_YIELD - - THREAD_DEAD - - THREAD_WAKEUP - - THREAD_AFFINITYï¼ˆçº¿ç¨‹ç»‘æ ¸ï¼‰ - - TIMER_TICK(确ä¿agent基于最新状æ€åšå†³å®šï¼‰ - -+ mq消æ¯é˜Ÿåˆ— - - - 组织方å¼ï¼šå…±äº«å†…å˜ä¸ä½¿ç”¨è‡ªå®šä¹‰é˜Ÿåˆ— - - - percpuæ¯ä¸ªcpuå’Œagent间有一个åˆå§‹mq - - - centralized所有cpu和全局agent间有一个åˆå§‹mq - - - -+ 线程和mqé—´ç»„ç»‡æ–¹å¼ - - - CREATE/DESTROY_QUEUE:创建/æ‘§æ¯mq - - - ASSOCIATE_QUEUE:修改线程msgå’Œmq之间的å‘é€å…³ç³» - -+ mqå’Œagenté—´ç»„ç»‡æ–¹å¼ - - - CONFIG_QUEUE_WAKEUP:自定义msgåˆ°æ¥æ—¶ï¼Œå¯¹agent的唤醒åŽçš„行为(centralized没有é…ç½®ï¼Œå› ä¸ºå…¨å±€agentä¸èƒ½è¢«é˜»å¡žï¼‰ - -+ 在mq/cpu间移动线程 - - - ASSOCIATE_QUEUE:修改线程msgå’Œmq之间的å‘é€å…³ç³»ï¼Œå¤±è´¥åœºæ™¯ï¼šè¯•图移动的线程还有msg在当å‰mqæ²¡æœ‰å¤„ç† - -+ 在agentså’Œkernelé—´åŒæ¥ - - - 在agentåšè°ƒåº¦ç–略决定的时候,å¯èƒ½åˆæœ‰æ–°çš„msg到æ¥ï¼ˆåŽé¢è®²è¿°å¦‚何解决) - - - Aseqå’ŒTseq的递增æ¡ä»¶ - -+ 通过共享内å˜ä¼ 递seqä¿®æ”¹ä¿¡æ¯ - - -##### 代ç†åˆ°å†…æ ¸çš„é€šä¿¡ - -+ agent通过transaction事务æ¥å’Œå†…æ ¸é€šä¿¡ - - - percpu:一个系统调用接å£è¶³çŸ£ï¼Œcentralizedï¼šæ ¸å¦‚æžœå¾ˆå¤šï¼Œé‚£ä¹ˆä½¿ç”¨ç³»ç»Ÿè°ƒç”¨æ€§èƒ½å°†ä¸‹é™ï¼Œå…±äº«å†…å˜æ›´åˆé€‚ï¼Œæ‰€ä»¥ï¼Œæœ€ç»ˆé‡‡ç”¨å…±äº«å†…å˜æ–¹æ¡ˆ - - - TXN_CREATE - - - TXNS_COMMIT:对于percpu,å‘生context swtich,æ„味ç€å½“å‰agent被替æ¢ä¸ºè¦è¿è¡Œçš„线程 - -+ Group commitsï¼ˆæ‰¹é‡æäº¤ï¼‰ - - - 对于centralized调度,å•个æäº¤ä¼šå¯¼è‡´æ€§èƒ½å¤§å¤§ä¸‹é™ - -+ seqæ ¸äº‹åŠ¡ - - - 在agentåšè°ƒåº¦ç–略决定的时候,å¯èƒ½åˆæœ‰æ–°çš„msg到æ¥ï¼ˆåŽé¢è®²è¿°å¦‚何解决),并且该msgå¯èƒ½æ¥è‡ªé«˜ä¼˜å…ˆçº§çº¿ç¨‹ï¼Œå½“å‰agent处于runningï¼Œæ— æ³•å”¤é†’é€šçŸ¥ - - - 1)Read Aseq - - - 2)读å–msq - - - 3)决定调度ç–ç•¥ - - - 4)commit,若commit的最新Aseqæ¯”å†…æ ¸è§‚æµ‹åˆ°çš„æœ€æ–°Aseqå°ï¼Œé‚£ä¹ˆcommit失败 - -+ 通过ebpfåŠ é€Ÿ - - - cpu空闲,但是agent没有调度线程时,ebpf会选择线程è¿è¡Œ - -##### centralized 调度 - -+ é¿å…全局agent线程被抢å - - - 全局agentä¼˜å…ˆçº§æœ€é«˜ï¼Œæ— è®ºghost还是éžghost,没有任何线程能抢å - - - é€ æˆè´Ÿé¢å½±å“:æ¯ä¸ªçº¿ç¨‹å˜åœ¨ç»‘定的工作线程 - - - 通过切æ¢åˆ°inactive çš„agent解决 - -+ sqlå’Œcentralized调度 - - - 判æ–Tseq是å¦ä¸ºæœ€æ–° - - -##### 故障隔离与动æ€å‡çº§ - -+ å’Œå†…æ ¸å…¶ä»–è°ƒåº¦ç–略的关系 - - - ä¼˜å…ˆçº§ä½ŽäºŽå†…æ ¸åŽŸç”Ÿè°ƒåº¦ç±»ï¼Œå¦‚cfs - -+ åŠ¨æ€æ›´æ–°ä¸Žå›žæ»š - - - 替æ¢agent,ä¿ç•™enclave:新旧agent - - - æ‘§æ¯enclave,从头开始:摧æ¯å½“å‰enclave下所有agent,相关线程é€å›žå†…æ ¸é»˜è®¤è°ƒåº¦ - - -+ 看门狗 - - - æ‘§æ¯ä¸è¿›è¡Œçº¿ç¨‹è°ƒåº¦çš„enclave - - -### 4.评估和对比 - -三个问题: - -+ ghost相比于其他调度器有啥é¢å¤–开销 - -+ 和之å‰çš„调度器相比 - -+ ghost是解决大规模低延迟工作负载,比如 Google Snap, Google Searchå’Œvirtual machinesçš„å¯è¡Œæ–¹æ¡ˆå— - -##### ghost的开销 - -+ 代ç é‡ï¼šå°‘,而且高级è¯è¨€é€šè¿‡è°ƒåº“使得代ç 釿›´å°‘ - -+ 消æ¯ä¼ 递开销 - -+ 本地调度开销(percpu) - -+ 远程调度开销(centralized) - æ¯ç§’æ¯ä¸ªcpuå¯ä»¥25200个线程(100个cpu),线程40us,能让所有cpuç¹å¿™ã€‚ - éšç€agent个数增多,这个数æ®ä¹Ÿæ˜¯çº¿æ€§å¢žé•¿ - -+ 全局agent性能分æžï¼šå…¨å±€agent调度其他线程,统计cpuæ•°é‡å’Œcommit个数的关系 - - 第一次drop:工作线程和全局代ç†ç«žäº‰ - - 第二次dropï¼šæ ¸é—´é€šä¿¡å¼€é”€å¤§äºŽæ ¸å¢žåŠ å¼€é”€ - -##### 和其他定制的centralized调度类比较 - -+ é…ç½® - - - åˆå§‹Shinjukuï¼šè°ƒåº¦çº¿ç¨‹ç»‘ç‰©ç†æ ¸ï¼Œ20ä¸ªå·¥ä½œçº¿ç¨‹ç»‘é€»è¾‘æ ¸ï¼ˆåˆ«çš„çº¿ç¨‹ä¸èƒ½åœ¨æ¤æ ¸ä¸Šè¿è¡Œï¼‰ - - 基于ghost:710行代ç ,应该全局agent线程,200个工作线程(都ä¸ç»‘æ ¸ï¼Œæ‰€ä»¥å…许空闲cpu调度其他线程) - - éžæŠ¢å 基于cfsçš„Shinjuku - - -+ å•一调度:åªè°ƒåº¦rocksdb - - ghost比åˆå§‹ä»£ç é‡å°‘82% - - 延迟ghostç¨å¾®é«˜ - - 饱和åžåé‡ç›¸å·®ä¸åˆ°5% - - åŽŸå› ï¼šå¯¹äºŽæ¯æ¬¡è¯·æ±‚,ghost都得调度一次;åˆå§‹åªè¦åœ¨20个pinä½çš„å·¥ä½œçº¿ç¨‹ä¸Šä¼ é€’è°ƒåº¦è¯·æ±‚å³å¯ - - - cfs由于ä¸èƒ½æŠ¢å ,太费 - -+ 多ç§è°ƒåº¦ï¼š - - åˆå§‹ç”±äºŽä¸èƒ½æŽ§åˆ¶å…¶ä»–cpu线程,当负载低的时候也ä¸èƒ½å°†cpu放出 - - 试Shenango?ä¸é€‚åˆæ‰§è¡Œæ—¶é—´æ—¶åˆ»å˜åŒ–的场景,导致延迟相比于Shinjukuæ›´ä¹… - - åžå釿¯”较和å•一调度差ä¸å¤šï¼Œä½†æ˜¯cpushare,ghostå¯ä»¥å°†ç©ºé—²cpu分享给其他线程 - - 
-个人总结:延迟ghost略逊,但是对于将cpu分享给其他线程这å—,ghoståšçš„æ›´å¥½ï¼ŒShinjukuä¸ä¼šåˆ†äº«ç©ºé—²cpu - -##### Google Snap - -+ 实时工作线程如何调度? - MicroQuantaä¿è¯åœ¨ä»»ä½•一个时间周期内(例如1ms)将cpu分享给指定工作线程:兼顾实时调度工作线程和ä¸è®©å…¶ä»–线程挨饿(个人解读:寻找一个æ°å¥½åˆé€‚的周期,能够兼顾二者) - - -+ 测试环境 - - 6对客户-æœåŠ¡å™¨æ¨¡åž‹çº¿ç¨‹ï¼Œ1个64b,å¦å¤–5个64kb - - 两ç§å®žéªŒæµ‹è¯•:quietåªæœ‰å·¥ä½œçº¿ç¨‹ï¼›loaded还有其他40竞争线程 - -+ ghosté…ç½® - - ghost的部署:全局agent线程管ç†å·¥ä½œçº¿ç¨‹å’Œå…¶ä»–竞争线程,snap工作线程具有高优先级 - - ghost管ç†çš„线程会被cfs调度的客户-æœåŠ¡å™¨æ¨¡åž‹çº¿ç¨‹å’ŒåŽŸç”Ÿçº¿ç¨‹æŠ¢å - - 竞争线程åªä¼šåœ¨ç©ºé—²æ—¶è¢«è°ƒåº¦ - -+ 延迟比较: - - quiet:64b ghostè¦å¥½ï¼Œ64kbåˆå§‹è¦å¥½ 64kbæ‹·è´æ—¶é—´è¿‡å¤šå¯¼è‡´è°ƒåº¦äº‹ä»¶è¿‡å°‘ - - loaded: åŒä¸Šï¼Œä½†æ˜¯64kb ghostçš„å·®è·æ²¡64b大 - -个人总结:ghostèƒ½å¤Ÿçµæ´»è°ƒåº¦çº¿ç¨‹åŽ»ç©ºé—²cpu,一个特别简å•çš„ghosté…置就能接近大é‡ä¿®æ”¹å†…æ ¸çš„MicroQuanta调度ç–ç•¥ - -##### Google Search - -+ 测试环境 - - - cpuå’Œå†…å˜ æ•æ„Ÿ - - SSDå’Œå†…å˜æ•感 - - cpuæ•æ„Ÿ - -+ ghosté…ç½® - - - 全局agent调度256个cpu,NUMAå’ŒCCX情况需è¦è€ƒè™‘,å‰è€…有自己内å˜ï¼ŒåŽè€…有自己L3cache - -+ QPS和延迟比较 - - - QPS二者差ä¸å¤šï¼ŒäºŒè€…都能针对NUMAå’ŒCCX情况优化,但是CFS需è¦é‡å¯ - - - 在延迟上,ghost明显好于cfs,除了第三ç§ã€‚CFS以毫秒级平衡线程调度,ghost则会在微秒级åšå‡ºå应。对于第三ç§ï¼Œä¹Ÿèƒ½é€šè¿‡é‡æ–°åˆ¶å®šè°ƒåº¦ç–略优化 - -+ 快速进行实验的ç»éªŒ - - - å¯¹å†…æ ¸ä¿®æ”¹åŽé‡æ–°ç¼–译花费很长时间,æ¯å¤©åªèƒ½å¹²5次 - - - ghostä¸éœ€ä¿®æ”¹é‡å¯å†…æ ¸ï¼Œåªéœ€è¦1分钟 - -个人总结:ghost低延迟,特定场景对空闲cpu的掌控能力使得其延迟很凶残 - -##### ä¿æŠ¤è™šæ‹Ÿæœºï¼Œé˜²æ¢L1TF/MDS攻击 - -psï¼šè¿™ä¸¤ä¸ªæ”»å‡»æˆ‘éƒ½ä¸æ‡‚,ä¹ï¼Œå¤§æ¦‚是å¯ä»¥ä»Žè¿è¡Œåœ¨åŒä¸€ä¸ªè¶…线程的å¦ä¸€ä¸ªè™šæ‹Ÿæœºä¸çªƒå–æ•°æ®ã€‚è§£å†³åŠžæ³•æ˜¯ç¡®ä¿æ¯ä¸ªè™šæ‹Ÿæœºçš„æ¯ä¸ªè™šæ‹Ÿcpuåªè¿è¡Œåœ¨ç‰¹å®šçš„ç‰©ç†æ ¸ä¸Šã€‚ Microarchitectural buffers在切æ¢å…¶ä»–虚拟机的时候需è¦è¢«æ¸…空 - -+ per-core è°ƒåº¦ï¼šè®©ä¸€ä¸ªæ ¸ä¸Šè¿è¡Œç›¸åŒè™šæ‹Ÿæœºçš„cpu - -+ 性能,差ä¸å¤šï¼Œä½†æ˜¯åŽä¸¤ä¸ªè¦å®‰å…¨ - -个人总结:针对攻击能够制定特定调度ç–é¿å…被攻击 - -### 5.未æ¥å·¥ä½œ - -+ 使用ebpfåŠ é€Ÿ - -+ 关闿—¶é—´ä¸æ– - -### 6.相关工作 - -没啥好看的 - -### 7.结论 - -没啥好说的 - diff --git a/record/dzh/google.md b/record/dzh/google.md deleted file mode 100644 index 7bae08bfa1bf3669e8959cbb567c468cb5514345..0000000000000000000000000000000000000000 --- a/record/dzh/google.md +++ /dev/null @@ -1,77 +0,0 @@ - - -æ¯«æ— ç–‘é—®ï¼Œæœ‰äººä¼šå°è¯•å°†BPFå¼•å…¥å†…æ ¸çš„CPUè°ƒåº¦å™¨ï¼Œè¿™åªæ˜¯æ—¶é—´é—®é¢˜ã€‚在1月底,Tejun Heo与David Vernetã€Josh Donå’ŒBarret Rhodenåˆä½œå‘布了30个补ä¸ç³»åˆ—çš„ç¬¬äºŒç‰ˆï¼Œæ—¨åœ¨å®žçŽ°è¿™ä¸€ç›®æ ‡ã€‚å°†è°ƒåº¦å†³ -ç–延迟到BPF程åºä¸å¯èƒ½ä¼šæœ‰ä¸€äº›æœ‰è¶£çš„事情,但è¦è®©æ•´ä¸ªå¼€å‘社区接å—这个想法å¯èƒ½éœ€è¦ä¸€äº›å·¥ä½œã€‚ - -BPFçš„æ ¸å¿ƒæ€æƒ³æ˜¯å…许程åºåœ¨è¿è¡Œæ—¶ä»Žç”¨æˆ·ç©ºé—´åŠ è½½åˆ°å†…æ ¸ä¸ï¼›ä½¿ç”¨BPF进行调度具有潜力使得调度行为与目å‰åœ¨Linux系统ä¸çœ‹åˆ°çš„æœ‰å¾ˆå¤§ä¸åŒã€‚â€œå¯æ’æ‹”â€çš„è°ƒåº¦å™¨æ¦‚å¿µå¹¶ä¸æ˜¯æ–°é²œçš„;例如,在2004年的一 -次讨论ä¸ï¼ŒCon Kolivasæå‡ºäº†ä¸€ç³»åˆ—注定失败的补ä¸ï¼Œå…¶ä¸æ¶‰åŠåˆ°å¯æ’æ‹”çš„è°ƒåº¦å™¨ã€‚å½“æ—¶ï¼Œè¿™ä¸ªå¯æ’拔调度器的想法å—到了强烈的åå¯¹ï¼›å› ä¸ºåªæœ‰å°†ç²¾åЛ集ä¸åœ¨å•个调度器上,开å‘社区æ‰èƒ½æ‰¾åˆ°ä¸€ç§æ–¹ -å¼ï¼Œæ»¡è¶³æ‰€æœ‰å·¥ä½œè´Ÿè½½ï¼Œè€Œä¸ä¼šå°†å†…æ ¸å¡«æ»¡å„ç§ç‰¹æ®Šç›®çš„的调度器的混乱。 - -å½“ç„¶ï¼Œå†…æ ¸åªæœ‰ä¸€ä¸ªCPU调度器的想法并ä¸å®Œå…¨å‡†ç¡®ï¼›å®žé™…ä¸Šï¼Œè¿˜æœ‰å‡ ä¸ªè°ƒåº¦å™¨å¯ä¾›åº”用程åºé€‰æ‹©ï¼ŒåŒ…æ‹¬å®žæ—¶è°ƒåº¦å™¨å’Œæˆªæ¢æ—¶é—´è°ƒåº¦å™¨ã€‚但是,在Linuxç³»ç»Ÿä¸Šå‡ ä¹Žæ‰€æœ‰çš„å·¥ä½œéƒ½åœ¨é»˜è®¤çš„â€œå®Œå…¨å…¬å¹³è°ƒåº¦å™¨â€ -下è¿è¡Œï¼Œå®ƒç¡®å®žåœ¨å„ç§ä»ŽåµŒå…¥å¼ç³»ç»Ÿåˆ°è¶…çº§è®¡ç®—æœºçš„å·¥ä½œè´Ÿè½½ç®¡ç†æ–¹é¢éƒ½åšå¾—很好。人们总是渴望更好的性能,但多年æ¥å‡ ä¹Žæ²¡æœ‰è¦æ±‚æä¾›å¯æ’拔调度器机制的请求。 - -那么,为什么现在æå‡ºBPF机制呢?为了é¿å…长时间的讨论,这个补ä¸ç³»åˆ—的说明信详细æè¿°äº†è¿™é¡¹å·¥ä½œçš„动机。简而言之,这个论点是,使用BPF编写调度ç–ç•¥æžå¤§åœ°é™ä½Žäº†å°è¯•新的调度方法的难度。自完 -全公平调度器问世以æ¥ï¼Œæˆ‘们的工作负载和è¿è¡Œå®ƒä»¬çš„系统å˜å¾—æ›´åŠ å¤æ‚;需è¦è¿›è¡Œå®žéªŒæ¥å¼€å‘适åˆå½“å‰ç³»ç»Ÿçš„调度算法。BPF调度类å¯ä»¥ä»¥å®‰å…¨çš„æ–¹å¼è¿›è¡Œå®žéªŒï¼Œç”šè‡³æ— 
éœ€é‡æ–°å¯åŠ¨æµ‹è¯•æœºå™¨ã€‚ä½¿ç”¨BPF编写 -的调度器还å¯ä»¥æé«˜é’ˆå¯¹æŸäº›ç‰¹å®šå·¥ä½œè´Ÿè½½çš„æ€§èƒ½ï¼Œè¿™äº›å·¥ä½œè´Ÿè½½å¯èƒ½ä¸å€¼å¾—åœ¨ä¸»çº¿å†…æ ¸ä¸æ”¯æŒï¼Œå¹¶ä¸”部署到大型系统集群ä¸ä¹Ÿæ›´åŠ å®¹æ˜“ã€‚ - -## Scheduling with BPF - -这个补ä¸é›†æ·»åŠ äº†ä¸€ä¸ªå为SCHED_EXT的新调度类,å¯ä»¥é€šè¿‡ç±»ä¼¼äºŽå¤§å¤šæ•°å…¶ä»–调用sched_setscheduler()的调用æ¥é€‰æ‹©å®ƒï¼ˆé€‰æ‹©SCHED_DEADLINEæœ‰ç‚¹æ›´åŠ å¤æ‚)。它是一个éžç‰¹æƒç±»ï¼Œè¿™æ„味ç€ä»»ä½•进程 -都å¯ä»¥å°†è‡ªå·±ç½®äºŽSCHED_EXTä¸ã€‚SCHED_EXTè¢«æ”¾ç½®åœ¨ä¼˜å…ˆçº§å †æ ˆä¸çš„空闲类(SCHED_IDLE)和完全公平调度器(SCHED_NORMALï¼‰ä¹‹é—´ã€‚å› æ¤ï¼Œæ²¡æœ‰SCHED_EXT调度器å¯ä»¥ä»¥ä¸€ç§é˜»æ¢ä¾‹å¦‚以SCHED_NORMAL -è¿è¡Œçš„æ™®é€šshell会è¯è¿è¡Œçš„æ–¹å¼æŽ¥ç®¡ç³»ç»Ÿã€‚它还建议,在使用SCHED_EXT的系统上,期望大部分工作负载将在该类ä¸è¿è¡Œã€‚ - -BPF编写的调度程åºå¯¹æ•´ä¸ªç³»ç»Ÿæ˜¯å…¨å±€çš„;没有为ä¸åŒçš„è¿›ç¨‹ç»„åŠ è½½è‡ªå·±çš„è°ƒåº¦ç¨‹åºçš„è§„å®šã€‚å¦‚æžœæ²¡æœ‰åŠ è½½BPF调度程åºï¼Œåˆ™æ”¾ç½®åœ¨SCHED_EXTç±»ä¸çš„任何进程将åƒåœ¨SCHED_NORMALä¸ä¸€æ ·è¿è¡Œã€‚然而,一旦 -åŠ è½½äº†BPF调度程åºï¼Œå®ƒå°†æŽ¥ç®¡æ‰€æœ‰SCHED_EXT任务的责任。还有一个神奇的函数,BPF调度程åºå¯ä»¥è°ƒç”¨ï¼ˆscx_bpf_switch_all()),它将所有è¿è¡Œåœ¨å®žæ—¶ä¼˜å…ˆçº§ä»¥ä¸‹çš„进程移动到SCHED_EXTä¸ã€‚ - -实现调度程åºçš„BPF程åºé€šå¸¸ä¼šç®¡ç†ä¸€ç»„调度队列,æ¯ä¸ªé˜Ÿåˆ—都å¯èƒ½åŒ…å«ç‰å¾…在CPU上执行的å¯è¿è¡Œä»»åŠ¡ã€‚é»˜è®¤æƒ…å†µä¸‹ï¼Œç³»ç»Ÿä¸æ¯ä¸ªCPU都有一个调度队列和一个全局队列。当CPU准备好è¿è¡Œæ–°ä»»åŠ¡æ—¶ï¼Œè°ƒåº¦ -程åºå°†ä»Žç›¸åº”的调度队列ä¸å–出一个任务并将其分é…ç»™CPU。调度程åºçš„BPF部分大多实现为一组通过æ“作结构调用的回调函数,æ¯ä¸ªå›žè°ƒå‡½æ•°é€šçŸ¥BPF代ç 需è¦è¿›è¡Œçš„事件或决ç–。该列表很长,完整的列表 -å¯ä»¥åœ¨SCHED_EXTå˜å‚¨åº“分支的include/sched/ext.h䏿‰¾åˆ°ã€‚该列表包括: - - 当一个新的任务进入SCHED_EXT时,prep_enable()å’Œenable()这两个回调函数将通知调度程åºã€‚prep_enable()å¯ä»¥ç”¨äºŽä¸ºè¯¥ä»»åŠ¡è®¾ç½®ä»»ä½•ç›¸å…³æ•°æ®ï¼Œå®ƒå¯ä»¥é˜»å¡žå¹¶ç”¨äºŽå†…å˜åˆ†é…。enable()则 - æ— æ³•é˜»å¡žï¼Œå®ƒå®žé™…ä¸Šå¯ç”¨äº†æ–°ä»»åŠ¡çš„è°ƒåº¦ã€‚ - - select_cpu()回调函数用于为刚刚唤醒的任务选择一个CPU,并返回è¦å°†ä»»åŠ¡æ”¾ç½®åœ¨çš„CPUç¼–å·ã€‚这个决ç–å¯ä»¥åœ¨ä»»åŠ¡å®žé™…è¿è¡Œä¹‹å‰é‡æ–°å®¡è§†ï¼Œä½†å®ƒå¯èƒ½è¢«è°ƒåº¦ç¨‹åºç”¨äºŽå”¤é†’选择的CPU(如果它当 - å‰å¤„于空闲状æ€ï¼‰ã€‚ - - enqueue()å›žè°ƒå‡½æ•°å°†ä¸€ä¸ªä»»åŠ¡åŠ å…¥è°ƒåº¦ç¨‹åºä»¥è¿›è¡Œè¿è¡Œã€‚通常,该回调将调用scx_bpf_dispatch()将任务放置到选择的调度队列ä¸ï¼Œè¯¥é˜Ÿåˆ—最终将在任务è¿è¡Œæ—¶ä¸ºå…¶æä¾›æ—¶é—´ç‰‡é•¿åº¦ã€‚如果将片 - 长指定为SCX_SLICE_INF,则在æ¤ä»»åŠ¡è¿è¡Œæ—¶ï¼ŒCPUå°†è¿›å…¥æ— èŠ‚æ‹æ¨¡å¼ã€‚ - - 值得注æ„的是,enqueue()ä¸å¿…将任务放入任何调度队列;如果任务ä¸åº”ç«‹å³è¿è¡Œï¼Œå®ƒå¯ä»¥å°†ä»»åŠ¡æš‚æ—¶æ”¾åœ¨æŸä¸ªåœ°æ–¹ã€‚ä½†å†…æ ¸ä¼šè·Ÿè¸ªè¿™äº›ä»»åŠ¡ï¼Œä»¥ç¡®ä¿æ²¡æœ‰ä»»åŠ¡è¢«é—忘;如果任务滞留时间过长(默 - 认为30秒,但超时时间å¯ä»¥ç¼©çŸï¼‰ï¼ŒBPFè°ƒåº¦ç¨‹åºæœ€ç»ˆå°†è¢«å¸è½½ã€‚ - - 当一个CPU的调度队列为空时,调用dispatch()回调函数将任务分派到该队列ä¸ä»¥ä¿æŒCPU忙碌。如果调度队列ä»ç„¶ä¸ºç©ºï¼Œè°ƒåº¦ç¨‹åºå°†å°è¯•从全局队列ä¸èŽ·å–任务。 - - update_idle()回调函数将通知调度程åºä¸€ä¸ªCPU何时进入或离开空闲状æ€ã€‚ - - runnable()ã€running()ã€stopping()å’Œquiescent()回调函数分别通知调度程åºä»»åŠ¡çš„çŠ¶æ€æ›´æ”¹ã€‚它们分别在任务å˜ä¸ºå¯è¿è¡Œã€åœ¨CPU上开始è¿è¡Œã€ä»ŽCPU上被å–下或å˜ä¸ºä¸å¯è¿è¡Œæ—¶è°ƒç”¨ã€‚ - - cpu_acquire()å’Œcpu_release()回调函数通知调度程åºç³»ç»Ÿä¸CPU的状æ€ã€‚当一个CPU对BPF调度程åºå¯ç”¨æ—¶ï¼Œå›žè°ƒå‡½æ•°cpu_acquire()将通知它这个事实。当一个CPUä¸å¯ç”¨æ—¶ï¼ˆä¾‹å¦‚,一个实时 - 调度类å¯èƒ½å·²ç»ä½¿ç”¨å®ƒï¼‰ï¼Œå°†é€šè¿‡è°ƒç”¨cpu_release()æ¥é€šçŸ¥å®ƒã€‚ - - -还有许多其他的回调函数用于控制组的管ç†ã€CPUäº²å’Œæ€§ã€æ ¸å¿ƒè°ƒåº¦ç‰ã€‚æ¤å¤–,还有一组函数å¯ä¾›è°ƒåº¦ç¨‹åºè°ƒç”¨ä»¥å½±å“调度决ç–;例如,scx_bpf_kick_cpu() å¯ç”¨äºŽæŠ¢å æ£åœ¨ç»™å®šCPU上è¿è¡Œçš„任务,并回 -调调度程åºä»¥é€‰æ‹©åœ¨è¯¥CPU上è¿è¡Œçš„æ–°ä»»åŠ¡ã€‚ - -## Examples - -最终的结果是一个框架,å…许在 BPF 代ç ä¸å®žçްå„ç§è°ƒåº¦ç–ç•¥ã€‚ä¸ºäº†è¯æ˜Žè¿™ä¸€ç‚¹ï¼Œè¿™ä¸ªè¡¥ä¸ç³»åˆ—包å«äº†è®¸å¤šç¤ºä¾‹è°ƒåº¦å™¨ã€‚å…¶ä¸ä¸€éƒ¨åˆ†æ˜¯ä¸€ä¸ªæœ€å°çš„“虚拟â€è°ƒåº¦å™¨ï¼Œå®ƒä½¿ç”¨é»˜è®¤çš„回调函数;å¦ä¸€ä¸ªåˆ™æ˜¯ä¸€ä¸ª -基本调度器,实现了五个优先级级别,并展示了如何将任务å˜å‚¨åˆ° BPF æ˜ å°„ä¸ã€‚â€œè™½ç„¶ä¸æ˜¯å¾ˆå®žç”¨ï¼Œä½†å®ƒä½œä¸ºä¸€ä¸ªç®€å•的示例很有用,并将用于演示ä¸åŒçš„功能â€ã€‚ - 
-æ¤å¤–,还有一个“ä¸å¤®â€è°ƒåº¦ç¨‹åºï¼Œå®ƒå°†ä¸€ä¸ªCPU专用于调度决ç–,使得所有其他CPU都å¯ä»¥è‡ªç”±è¿è¡Œå·¥ä½œè´Ÿè½½ã€‚åŽç»çš„è¡¥ä¸ä¸ºè¯¥è°ƒåº¦ç¨‹åºæ·»åŠ äº†tickless支æŒï¼Œå¹¶æ€»ç»“é“: - - 尽管 scx_example_central本身ä¸è¶³ä»¥ç”¨ä½œç”Ÿäº§è°ƒåº¦ç¨‹åºï¼Œä½†å¯ä»¥ä½¿ç”¨ç›¸åŒçš„æ–¹æ³•构建更具特色的ä¸å¤®è°ƒåº¦ç¨‹åºã€‚Google çš„ç»éªŒè¡¨æ˜Žï¼Œè¿™ç§æ–¹æ³•对æŸäº›åº”用程åºï¼ˆå¦‚ VM 托管)具有é‡è¦çš„好处 - -æ¤å¤–,scx_example_pair é‡‡ç”¨æŽ§åˆ¶ç»„å®žçŽ°äº†ä¸€ç§æ ¸å¿ƒè°ƒåº¦å½¢å¼ã€‚scx_example_userland 调度程åºâ€œåœ¨ç”¨æˆ·ç©ºé—´å®žçŽ°äº†ä¸€ä¸ªç›¸å½“ä¸æˆç†Ÿçš„æŽ’åºåˆ—表 vruntime 调度程åºï¼Œä»¥æ¼”示大多数调度决ç–如何委托给 -用户空间â€ã€‚该系列最åŽä»‹ç»äº† Atropos 调度程åºï¼Œå®ƒå…·æœ‰ç”¨ Rust 编写的é‡è¦çš„用户空间组件。信件å°é¢è¿˜ä»‹ç»äº†å¦ä¸€ä¸ªè°ƒåº¦ç¨‹åº scx_example_cgfifoï¼Œå› ä¸ºå®ƒä¾èµ–äºŽä»æœªåˆå¹¶åˆ°ä¸»çº¿çš„ BPF rbtree -è¡¥ä¸è€Œæœªè¢«åŒ…å«åœ¨è¯¥ç³»åˆ—ä¸ã€‚它“为å„个工作负载æä¾› FIFO ç–略,并æä¾›æ‰å¹³åŒ–分层 vtree 用于控制组â€ï¼Œæ˜¾ç„¶åœ¨ Apache Web æœåŠ¡åŸºå‡†æµ‹è¯•ä¸æ¯” SCHED_NORMAL æä¾›æ›´å¥½çš„æ€§èƒ½ã€‚ - -## Prospects - -这个补ä¸é›†ç›®å‰å·²ç»å‘布了第二个版本,并且迄今为æ¢è¿˜æ²¡æœ‰å¼•èµ·å¾ˆå¤šè¯„è®ºï¼Œä¹Ÿè®¸å¤ªå¤§äº†ï¼Œæ— æ³•è¿›è¡Œè¾©è®ºã€‚ç„¶è€Œï¼Œè°ƒåº¦å™¨ç»´æŠ¤è€…Peter Zijlstra在第一个版本ä¸å›žåº”说:“我讨厌所有这些。Linus在过去多 -次å¦å†³äº†å¯åŠ è½½çš„è°ƒåº¦å™¨ï¼Œè¿™åªæ˜¯åˆä¸€æ¬¡â€”â€”åŠ ä¸Šäº†æ•´ä¸ªBPF问题的é¢å¤–缺陷。â€ç„¶è€Œï¼Œä»–ç»§ç»å®¡æŸ¥äº†è®¸å¤šç»„æˆè¡¥ä¸ï¼Œè¿™è¡¨æ˜Žä»–å¯èƒ½ä¸æ‰“算完全拒ç»è¿™é¡¹å·¥ä½œã€‚ - -BPFè°ƒåº¦å™¨ç±»æ˜¾ç„¶æ˜¯æ ¸å¿ƒå†…æ ¸ç¤¾åŒºéš¾ä»¥æŽ¥å—çš„é‡è¦æ”¹åŠ¨ã€‚å®ƒå¢žåŠ äº†è¶…è¿‡10,000è¡Œçš„æ ¸å¿ƒä»£ç ,并公开了许多迄今为æ¢è¢«æ·±æ·±éšè—åœ¨å†…æ ¸ä¸çš„è°ƒåº¦ç»†èŠ‚ã€‚è¿™å°†æ‰¿è®¤ä¸€ä¸ªé€šç”¨è°ƒåº¦å™¨æ— æ³•æœ€ä¼˜åœ°æœåŠ¡äºŽæ‰€æœ‰å·¥ä½œè´Ÿ -载。一些人å¯èƒ½æ‹…å¿ƒè¿™å°†æ ‡å¿—ç€å®Œå…¨å…¬å¹³è°ƒåº¦å™¨çš„工作结æŸï¼Œå¹¶å¢žåŠ Linux系统的碎片化。BPF调度器的开å‘人员则æŒç›¸åçš„è§‚ç‚¹ï¼Œè®¤ä¸ºèƒ½å¤Ÿè‡ªç”±å®žéªŒè°ƒåº¦æ¨¡åž‹ï¼Œå°†åŠ é€Ÿå®Œå…¨å…¬å¹³è°ƒåº¦å™¨çš„æ”¹è¿›ã€‚ - -这个å系统的最终结果如何还很难预测,但å¯ä»¥æŒ‡å‡ºçš„æ˜¯ï¼Œè¿„今为æ¢ï¼ŒBPFå·¨å¤´å·²ç»æˆåŠŸåœ°å…‹æœäº†å‡ 乎所有é‡åˆ°çš„å对æ„è§ã€‚åœ¨å†…æ ¸ä¸é”å®šæ ¸å¿ƒåŠŸèƒ½çš„æ—¶ä»£ä¼¼ä¹Žæ£åœ¨ç»“æŸã€‚看到这个å系统将会开å¯å“ªäº›æ–°çš„ -调度方法将会是很有趣的。 diff --git a/record/dzh/huawe.md b/record/dzh/huawe.md deleted file mode 100644 index c1c847d36a95dd0c785ae757526dd92c90c62615..0000000000000000000000000000000000000000 --- a/record/dzh/huawe.md +++ /dev/null @@ -1,22 +0,0 @@ -ä½ å¥½Roman和列表æˆå‘˜ï¼Œ - -我们希望实现一个å¯ç¼–程的调度器,以满足ä¸åŒå·¥ä½œè´Ÿè½½çš„调度需求。 - -使用BPF,我们å¯ä»¥è½»æ¾åœ°ä¸ºç‰¹å®šå·¥ä½œè´Ÿè½½éƒ¨ç½²è°ƒåº¦ç–略,快速验è¯ï¼Œæ— éœ€ä¿®æ”¹å†…æ ¸ä»£ç 。这大大é™ä½Žäº†åœ¨ç”Ÿäº§çŽ¯å¢ƒä¸éƒ¨ç½²æ–°è°ƒåº¦ç–ç•¥çš„æˆæœ¬ã€‚ - -å› æ¤ï¼Œæˆ‘们希望在您的补ä¸çš„基础上继ç»å¼€å‘。我们计划将其åˆå¹¶åˆ°openeulerå¼€æºç¤¾åŒºä¸ï¼Œå¹¶åˆ©ç”¨ç¤¾åŒºä¸æ–演进和维护它。 -(链接:https://www.openeuler.org/en/) - -我们对您的补ä¸è¿›è¡Œäº†ä¸€äº›æ›´æ”¹ï¼š -1.适应openeuler-OLK-5.10分支,该分支大多基于长期支æŒçš„Linux分支5.10。 -2.引入Kconfig CONFIG_BPF_SCHED以在编译时隔离相关代ç 。 -3.修改了helpers bpf_sched_entity_to_cgrpid()å’Œbpf_sched_entity_belongs_to_cgrp(),通过se->my_q->tg->css.cgroup获å–调度实体所属的任务组。 - -我们有一些关于下一次Scheduler BPFè¿ä»£çš„æƒ³æ³•,想与您分享: -1.在struct task_structå’Œstruct task_group䏿·»åŠ tagå—æ®µã€‚用户å¯ä»¥ä½¿ç”¨æ–‡ä»¶ç³»ç»ŸæŽ¥å£ä¸ºç‰¹å®šå·¥ä½œè´Ÿè½½æ ‡è®°ä¸åŒçš„æ ‡ç¾ã€‚bpf progèŽ·å–æ ‡ç¾ä»¥æ£€æµ‹ä¸åŒçš„工作负载。 -2.æ·»åŠ BPF hookå’Œhelperæ¥è°ƒåº¦è¿›ç¨‹ï¼Œå¦‚select_task_rqå’Œpick_next_taskï¼Œä»¥å®žçŽ°å¯æ‰©å±•性。 - -这是一个新的å°è¯•,åŽé¢è‚¯å®šä¼šæœ‰å¾ˆå¤šé—®é¢˜ï¼Œä½†è®©è°ƒåº¦å™¨å¯ç¼–程是令人兴奋的。 - -ç¥å¥½ï¼Œ -ä»»å¿—æ° diff --git a/sched/Makefile b/sched/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..fa9430ec0851787f5e645b650a0ac04d21dff2f1 --- /dev/null +++ b/sched/Makefile @@ -0,0 +1,200 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
include ../../build/Build.include
+include ../../scripts/Makefile.arch
+include ../../scripts/Makefile.include
+
+ifneq ($(LLVM),)
+ifneq ($(filter %/,$(LLVM)),)
+LLVM_PREFIX := $(LLVM)
+else ifneq ($(filter -%,$(LLVM)),)
+LLVM_SUFFIX := $(LLVM)
+endif
+
+CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi
+CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu
+CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl
+CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu
+CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu
+CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu
+CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu
+CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu
+CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu
+CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH))
+
+ifeq ($(CROSS_COMPILE),)
+ifeq ($(CLANG_TARGET_FLAGS),)
+$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk)
+else
+CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS)
+endif # CLANG_TARGET_FLAGS
+else
+CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif # CROSS_COMPILE
+
+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as
+else
+CC := $(CROSS_COMPILE)g++
+endif # LLVM
+
+# clang is used for the BPF objects below regardless of which compiler
+# builds the userspace parts.
+CLANG ?= clang
+
+CURDIR := $(abspath .)
+TOOLSDIR := $(abspath ../..)
+LIBDIR := $(TOOLSDIR)/lib
+COS_LIBDIR := $(CURDIR)/../lib
+BPFDIR := $(LIBDIR)/bpf
+TOOLSINCDIR := $(TOOLSDIR)/include
+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
+APIDIR := $(TOOLSINCDIR)/uapi
+GENDIR := $(abspath ../../../include/generated)
+GENHDR := $(GENDIR)/autoconf.h
+# Developer-specific; override on the command line or in the environment.
+LLVM_INCLUDE ?= /home/shootfirst/llvm-project/build/lib/clang/17/include/
+
+SCRATCH_DIR := $(CURDIR)/tools
+BUILD_DIR := $(SCRATCH_DIR)/build
+INCLUDE_DIR := $(SCRATCH_DIR)/include
+BPFOBJ_DIR := $(BUILD_DIR)/libbpf
+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a
+ifneq ($(CROSS_COMPILE),)
+HOST_BUILD_DIR := $(BUILD_DIR)/host
+HOST_SCRATCH_DIR := host-tools
+HOST_INCLUDE_DIR := $(HOST_SCRATCH_DIR)/include
+else
+HOST_BUILD_DIR := $(BUILD_DIR)
+HOST_SCRATCH_DIR := $(SCRATCH_DIR)
+HOST_INCLUDE_DIR := $(INCLUDE_DIR)
+endif
+HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a
+RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids
+DEFAULT_BPFTOOL := $(HOST_SCRATCH_DIR)/sbin/bpftool
+
+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \
+		     $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
+		     ../../../vmlinux \
+		     /sys/kernel/btf/vmlinux \
+		     /boot/vmlinux-$(shell uname -r)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+ifeq ($(VMLINUX_BTF),)
+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
+endif
+
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+
+ifneq ($(wildcard $(GENHDR)),)
+  GENFLAGS := -DHAVE_GENHDR
+endif
+
+CFLAGS += -g -O2 -rdynamic -pthread -std=c++11 -Wall -Werror $(GENFLAGS) \
+	  -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
+	  -I$(TOOLSINCDIR) -I$(APIDIR) -I$(COS_LIBDIR)
+
+CARGOFLAGS := --release
+
+# Silence some warnings when compiled with clang
+ifneq ($(LLVM),)
+CFLAGS += -Wno-unused-command-line-argument
+endif
+
+LDFLAGS = -lelf -lz -lpthread
+
+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
+			grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '-target bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+
+define get_sys_includes +$(shell $(1) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ +$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}') +endef + +BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ + $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \ + -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) -I$(LLVM_INCLUDE) -I$(COS_LIBDIR) \ + -I../../../include \ + $(call get_sys_includes,$(CLANG)) \ + -Wall -Wno-compare-distinct-pointer-types \ + -O2 -mcpu=v3 + +all: scx_sjf scx_mfq + +# sort removes libbpf duplicates when not cross-building +MAKE_DIRS := $(sort $(BUILD_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \ + $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \ + $(INCLUDE_DIR)) + +$(MAKE_DIRS): + $(call msg,MKDIR,,$@) + $(Q)mkdir -p $@ + +$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ + $(APIDIR)/linux/bpf.h \ + | $(BUILD_DIR)/libbpf + $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ + EXTRA_CFLAGS='-g -O0 -fPIC' \ + DESTDIR=$(SCRATCH_DIR) prefix= all install_headers + +$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ + $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool + $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ + ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) \ + EXTRA_CFLAGS='-g -O0' \ + OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ + LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \ + LIBBPF_DESTDIR=$(HOST_SCRATCH_DIR)/ \ + prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install-bin + +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) +ifeq ($(VMLINUX_H),) + $(call msg,GEN,,$@) + $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ +else + $(call msg,CP,,$@) + $(Q)cp "$(VMLINUX_H)" $@ +endif + +%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h user_exit_info.h \ + | $(BPFOBJ) + $(call msg,CLNG-BPF,,$@) + $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ + +%.skel.h: %.bpf.o $(BPFTOOL) + $(call msg,GEN-SKEL,,$@) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< + $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) + $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) + $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) + $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(notdir $(<:.bpf.o=)) > $@ + $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(notdir $(<:.bpf.o=)) > $(@:.skel.h=.subskel.h) + +scx_sjf: $(CURDIR)/sjf/scx_sjf.c $(CURDIR)/sjf/scx_sjf.skel.h \ + $(CURDIR)/sjf/scx_sjf_common.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + +scx_mfq: $(CURDIR)/mfq/scx_mfq.c $(CURDIR)/mfq/scx_mfq.skel.h \ + $(CURDIR)/mfq/scx_mfq_common.h user_exit_info.h + $(CC) $(CFLAGS) -c $< -o $@.o + $(CC) -o $@ $@.o $(HOST_BPFOBJ) $(LDFLAGS) + + +clean: + rm -rf $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) + rm -f sjf/*.o sjf/*.bpf.o sjf/*.skel.h sjf/*.subskel.h *.o + rm -f scx_sjf + rm -f mfq/*.o mfq/*.bpf.o mfq/*.skel.h mfq/*.subskel.h *.o + rm -f scx_mfq + +install: + sudo mkdir -p /etc/cos/shm + +.PHONY: all clean install + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.skel.h, .bpf.o, etc) targets +.SECONDARY: diff --git a/sched/mfq/mfq_sched.h b/sched/mfq/mfq_sched.h new file mode 100644 index 0000000000000000000000000000000000000000..3bccac920c23a91d52deac9d690e7af0b2ef228d --- /dev/null +++ b/sched/mfq/mfq_sched.h @@ -0,0 +1,168 @@ +#include <vector> +#include <queue> +#include <cassert> 
#include <iostream>
+#include <chrono>
+#include <ctime>         // std::time_t
+#include <cstdio>        // printf
+#include <algorithm>     // std::min
+#include <linux/types.h> // __u64/__s32, shared with the BPF side
+
+#define MS2NS (1 * 1000 * 1000)
+
+
+/* The data structure containing tasks that are enqueued in user space. */
+struct enqueued_task {
+	__u64 sum_exec_runtime;
+	__s32 queue_id;
+};
+
+/* Current time as nanoseconds since the epoch; all of the queue-gap
+ * bookkeeping below is in nanoseconds. */
+static inline std::time_t getTimeStamp()
+{
+	auto tp = std::chrono::system_clock::now();
+	return std::chrono::duration_cast<std::chrono::nanoseconds>(
+		tp.time_since_epoch()).count();
+}
+
+
+class MFQSched {
+private:
+	// number of queues
+	static const size_t queue_num_ = 5;
+
+	// per-queue capacity
+	const std::vector<size_t> queue_sizes_ = {1000, 2000, 4000, 6000, 10000};
+
+	// per-queue max accumulated runtime; the last queue is unbounded
+	const std::vector<__u64> max_time_ = {1 * MS2NS, 3 * MS2NS, 10 * MS2NS, 20 * MS2NS, 0};
+
+	// per-queue starvation budget: the longest a non-empty queue may go
+	// without being scheduled
+	const std::vector<__u64> max_gap_ = {0, 6 * MS2NS, 20 * MS2NS, 40 * MS2NS, 100 * MS2NS};
+
+	// queues storing thread info
+	std::vector<std::queue<enqueued_task*>> mfq_;
+
+	// last time each queue was scheduled
+	std::vector<__u64> latest_sched_time_;
+
+	// number of tasks to pop per scheduling round
+	size_t total_sched_num_;
+
+	// portion of a round reserved for the chosen queue when a starving
+	// queue also needs service
+	size_t normal_sched_num_;
+
+
+public:
+
+	MFQSched() {}
+
+	MFQSched(size_t total_sched_num) : total_sched_num_(total_sched_num) {
+
+		normal_sched_num_ = total_sched_num - total_sched_num / 4;
+
+		__u64 now = (__u64)getTimeStamp();
+		printf("MFQ init! now is %llu\n", now);
+
+		latest_sched_time_ = std::vector<__u64>(queue_num_, now);
+
+		mfq_ = std::vector<std::queue<enqueued_task*>>(queue_num_);
+	}
+
+	bool Enqueue(enqueued_task* task) {
+		// decide which queue to push to: the more runtime a task has
+		// accumulated, the lower its queue
+		size_t queue_id = 0;
+		for (; queue_id < queue_num_ - 1; queue_id++) {
+			// printf("task->sum_exec_runtime %llu, queue %lu max_time %llu\n", task->sum_exec_runtime, queue_id, max_time_[queue_id]);
+			if (task->sum_exec_runtime < max_time_[queue_id]) {
+				break;
+			}
+		}
+		task->queue_id = queue_id;
+
+		// target queue is full
+		if (mfq_[queue_id].size() == queue_sizes_[queue_id]) {
+			return false;
+		}
+
+		// push the thread to the target queue
+		mfq_[queue_id].push(task);
+
+		return true;
+	}
+
+	std::vector<enqueued_task*> Schedule() {
+
+		__u64 now = (__u64)getTimeStamp();
+
+		// 1. highest-priority non-empty queue
+		size_t chosen_queue = 0;
+		for (; chosen_queue < queue_num_; chosen_queue++) {
+			if (!mfq_[chosen_queue].empty()) {
+				break;
+			}
+		}
+
+		// 2. most-starved lower queue: non-empty and past its max_gap_
+		// budget (indexing i, not i - 1, so an empty queue is never popped)
+		size_t max_gap_queue = queue_num_;
+		__u64 max_gap_time = 0;
+		for (size_t i = chosen_queue + 1; i < queue_num_; i++) {
+			__u64 gap = now - latest_sched_time_[i];
+			if (!mfq_[i].empty() && gap > max_gap_[i] && gap > max_gap_time) {
+				max_gap_time = gap;
+				max_gap_queue = i;
+			}
+		}
+
+		// printf("chosen_queue is %lu, max_gap_queue is %lu\n", chosen_queue, max_gap_queue);
+
+		// 3. process by case
+		std::vector<enqueued_task*> ans;
+
+		// 3.1 every queue is empty
+		if (chosen_queue >= queue_num_) {
+			return ans;
+
+		// 3.2 a chosen queue, but no starving queue
+		} else if (max_gap_queue >= queue_num_) {
+			// a. drain the chosen queue
+			size_t cnt = std::min(mfq_[chosen_queue].size(), total_sched_num_);
+			for (size_t i = 0; i < cnt; i++) {
+				ans.push_back(mfq_[chosen_queue].front());
+				mfq_[chosen_queue].pop();
+			}
+
+			// b. update time
+			latest_sched_time_[chosen_queue] = now;
+
+		// 3.3 both a chosen queue and a starving queue
+		} else {
+			// a. chosen queue
+			size_t cnt = std::min(mfq_[chosen_queue].size(), normal_sched_num_);
+			for (size_t i = 0; i < cnt; i++) {
+				ans.push_back(mfq_[chosen_queue].front());
+				mfq_[chosen_queue].pop();
+			}
+
+			// b. starving queue
+			size_t starvation_cnt = std::min(mfq_[max_gap_queue].size(), total_sched_num_ - cnt);
+			for (size_t i = 0; i < starvation_cnt; i++) {
+				ans.push_back(mfq_[max_gap_queue].front());
+				mfq_[max_gap_queue].pop();
+			}
+
+			// c. update times
+			latest_sched_time_[chosen_queue] = now;
+			latest_sched_time_[max_gap_queue] = now;
+		}
+
+		return ans;
+	}
+
+};
\ No newline at end of file
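Before the BPF half, it may help to see how this queue structure is meant to be driven. The sketch below is illustrative only (not part of the patch; the runtimes are made up): Enqueue() places a task in a lower queue as its accumulated runtime grows, and Schedule() returns the next batch to dispatch. scx_mfq.c further down wires exactly this cycle to the BPF maps.

```cpp
#include "mfq_sched.h"

int main() {
    MFQSched mfq(8); // dispatch at most 8 tasks per round

    enqueued_task short_task = {/*sum_exec_runtime=*/0, /*queue_id=*/0};
    enqueued_task long_task  = {/*sum_exec_runtime=*/50 * MS2NS, /*queue_id=*/0};

    mfq.Enqueue(&short_task); // runtime < 1ms: stays in the highest queue
    mfq.Enqueue(&long_task);  // runtime >= 20ms: demoted to the last queue

    for (enqueued_task* t : mfq.Schedule())
        printf("dispatching a task from queue %d\n", t->queue_id);
}
```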
diff --git a/sched/mfq/scx_mfq.bpf.c b/sched/mfq/scx_mfq.bpf.c
new file mode 100644
index 0000000000000000000000000000000000000000..714218dcc88f36eaa14f3c00e30d27d2a71e61cb
--- /dev/null
+++ b/sched/mfq/scx_mfq.bpf.c
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * The BPF side of the scx_mfq userland scheduler.
+ *
+ * In terms of scheduling, this provides two different types of behaviors:
+ * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity.
+ *    All such tasks are direct-dispatched from the kernel, and are never
+ *    enqueued in user space.
+ * 2. A multi-level feedback queue (MFQ) policy implemented in user space,
+ *    for all other tasks.
+ *
+ * Some parts of this scheduler could be implemented more efficiently using
+ * more complex and sophisticated data structures. For example, rather than
+ * using BPF_MAP_TYPE_QUEUE's, BPF_MAP_TYPE_{USER_}RINGBUF's could be used
+ * for exchanging messages between user space and kernel space. Similarly,
+ * the user-space side keeps one plain FIFO per feedback level.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <string.h>
+#include "../scx_common.bpf.h"
+#include "scx_mfq_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+const volatile bool switch_partial;
+const volatile s32 usersched_pid;
+
+/* !0 for veristat, set during init */
+const volatile u32 num_possible_cpus = 64;
+
+/* Stats that are printed by user space. */
+u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues;
+
+struct user_exit_info uei;
+
+/*
+ * Whether the user space scheduler needs to be scheduled due to a task being
+ * enqueued in user space.
+ */
+static bool usersched_needed;
+
+/*
+ * The map containing tasks that are enqueued in user space from the kernel.
+ *
+ * This map is drained by the user space scheduler.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, USERLAND_MAX_TASKS);
+	__type(value, struct scx_userland_enqueued_task);
+} enqueued SEC(".maps");
+
+/*
+ * The map containing tasks that are dispatched to the kernel from user space.
+ *
+ * Drained by the kernel in userland_dispatch().
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, USERLAND_MAX_TASKS);
+	__type(value, s32);
+} dispatched SEC(".maps");
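+
+/*
+ * The two BPF_MAP_TYPE_QUEUE maps above form the kernel<->user hand-off:
+ * the BPF side pushes scx_userland_enqueued_task records into 'enqueued',
+ * and user space pushes the pids it wants run back through 'dispatched'.
+ * A minimal sketch of the user-space half of that loop, assuming the map
+ * fds were obtained from the loaded skeleton as scx_mfq.c does below:
+ *
+ *	struct scx_userland_enqueued_task task;
+ *	s32 pid;
+ *
+ *	while (!bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) {
+ *		pid = task.pid;
+ *		// ...policy decision (MFQSched::Enqueue/Schedule) here...
+ *		bpf_map_update_elem(dispatched_fd, NULL, &pid, 0);
+ *	}
+ */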
+ */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, USERLAND_MAX_TASKS); + __type(value, s32); +} dispatched SEC(".maps"); + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ */ +}; + +/* Map that contains task-local storage. */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +static bool is_usersched_task(const struct task_struct *p) +{ + return p->pid == usersched_pid; +} + +static bool keep_in_kernel(const struct task_struct *p) +{ + return p->nr_cpus_allowed < num_possible_cpus; +} + +static struct task_struct *usersched_task(void) +{ + struct task_struct *p; + + p = bpf_task_from_pid(usersched_pid); + /* + * Should never happen -- the usersched task should always be managed + * by sched_ext. + */ + if (!p) + scx_bpf_error("Failed to find usersched task %d", usersched_pid); + + return p; +} + +s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + if (keep_in_kernel(p)) { + s32 cpu; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to look up task-local storage for %s", p->comm); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + if (cpu >= 0) { + tctx->force_local = true; + return cpu; + } + } + + return prev_cpu; +} + +static void dispatch_user_scheduler(void) +{ + struct task_struct *p; + + usersched_needed = false; + p = usersched_task(); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) +{ + + struct scx_userland_enqueued_task task; + + memset(&task, 0, sizeof(task)); + task.pid = p->pid; + task.sum_exec_runtime = p->se.sum_exec_runtime; + task.weight = p->scx.weight; + // bpf_trace_printk("enqueue taggered! task->pid = %d, tgid = %d\n",p->pid,p->tgid); + if (bpf_map_push_elem(&enqueued, &task, 0)) { + /* + * If we fail to enqueue the task in user space, put it + * directly on the global DSQ. + */ + __sync_fetch_and_add(&nr_failed_enqueues, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } else { + __sync_fetch_and_add(&nr_user_enqueues, 1); + usersched_needed = true; + } +} + +void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (keep_in_kernel(p)) { + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + /* + // Per-task scheduling context + struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ + }; + */ + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to lookup task ctx for %s", p->comm); + return; + } + + if (tctx->force_local) + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&nr_kernel_enqueues, 1); + return; + } else if (!is_usersched_task(p)) { + enqueue_task_in_user_space(p, enq_flags); + } +} +/* + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using + * scx_bpf_consume(). 
+*/ +void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) +{ + /* + * Whether the user space scheduler needs to be scheduled due to a task being + * enqueued in user space. + */ + if (usersched_needed) + dispatch_user_scheduler(); + + bpf_repeat(4096) { + s32 pid; + struct task_struct *p; + + if (bpf_map_pop_elem(&dispatched, &pid)) + break; + + /* + * The task could have exited by the time we get around to + * dispatching it. Treat this as a normal occurrence, and simply + * move onto the next iteration. + */ + p = bpf_task_from_pid(pid); + if (!p) + continue; + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(userland_init) +{ + if (num_possible_cpus == 0) { + scx_bpf_error("User scheduler # CPUs uninitialized (%d)", + num_possible_cpus); + return -EINVAL; + } + + if (usersched_pid <= 0) { + scx_bpf_error("User scheduler pid uninitialized (%d)", + usersched_pid); + return -EINVAL; + } + + if (!switch_partial) + scx_bpf_switch_all(); + return 0; +} + +void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops userland_ops = { + .select_cpu = (void *)userland_select_cpu, + .enqueue = (void *)userland_enqueue, + .dispatch = (void *)userland_dispatch, + .prep_enable = (void *)userland_prep_enable, + .init = (void *)userland_init, + .exit = (void *)userland_exit, + .timeout_ms = 3000, + .name = "userland", +}; diff --git a/sched/mfq/scx_mfq.c b/sched/mfq/scx_mfq.c new file mode 100644 index 0000000000000000000000000000000000000000..814182456731fdbe2fec863032de8fabc72b7d19 --- /dev/null +++ b/sched/mfq/scx_mfq.c @@ -0,0 +1,368 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext user space scheduler which provides vruntime semantics + * using a simple ordered-list implementation. + * + * Each CPU in the system resides in a single, global domain. This precludes + * the need to do any load balancing between domains. The scheduler could + * easily be extended to support multiple domains, with load balancing + * happening in user space. + * + * Any task which has any CPU affinity is scheduled entirely in BPF. This + * program only schedules tasks which may run on any CPU. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +//#define _GNU_SOURCE +#include <stdio.h> +#include <unistd.h> +#include <sched.h> +#include <signal.h> +#include <assert.h> +#include <libgen.h> +#include <pthread.h> +#include <bpf/bpf.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <sys/syscall.h> +#include <map> + +#include "../user_exit_info.h" +#include "scx_mfq_common.h" +#include "scx_mfq.skel.h" + +#include "mfq_sched.h" + +const char help_fmt[] = +"A minimal userland sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-b BATCH] [-p]\n" +"\n" +" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" +" -p Don't switch all, switch only tasks on SCHED_EXT policy\n" +" -h Display this help and exit\n"; + +/* Defined in UAPI */ +#define SCHED_EXT 7 + +/* Number of tasks to batch when dispatching to user space. 
*/ +static __u32 batch_size = 8; + +static volatile int exit_req; +static int enqueued_fd, dispatched_fd; + +static struct scx_mfq *skel; +static struct bpf_link *ops_link; + +/* Stats collected in user space. */ +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches; + + + + +/* + * The statically allocated array of tasks. We use a statically allocated list + * here to avoid having to allocate on the enqueue path, which could cause a + * deadlock. A more substantive user space scheduler could e.g. provide a hook + * for newly enabled tasks that are passed to the scheduler from the + * .prep_enable() callback to allows the scheduler to allocate on safe paths. + */ +struct enqueued_task tasks[USERLAND_MAX_TASKS]; + +/* queues for mfq. */ +MFQSched mfq(batch_size); + +static void sigint_handler(int userland) +{ + exit_req = 1; +} + +//**************************************drain_enqueued_map******************************************* +static struct enqueued_task *get_enqueued_task(__s32 pid) +{ + if (pid >= USERLAND_MAX_TASKS) + return NULL; + + return &tasks[pid]; +} + +static int mfq_enqueue(const struct scx_userland_enqueued_task *bpf_task) +{ + struct enqueued_task *curr = nullptr; + + curr = get_enqueued_task(bpf_task->pid); + if (!curr) { + return ENOENT; + } + + curr->sum_exec_runtime = bpf_task->sum_exec_runtime; + nr_vruntime_enqueues++; + + /* Enqueue the task to the mfq queue */ + + bool ans = mfq.Enqueue(curr); + + if (!ans) { + printf("Enqueue error!\n"); + exit_req = 1; + return ENOENT; + } + + return 0; +} + +static void drain_enqueued_map(void) +{ + while (1) { + + struct scx_userland_enqueued_task task; + int err; + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)){ + return; + } + + err = mfq_enqueue(&task); + + if (err) { + fprintf(stderr, "Failed to enqueue task %d: %s\n", + task.pid, strerror(err)); + exit_req = 1; + return; + } + + } +} +//**************************************drain_enqueued_map******************************************* + + +//**************************************dispatch_batch***************************************** +static __u32 task_pid(const struct enqueued_task *task) +{ + return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); +} + +static int dispatch_task(s32 pid) +{ + int err; + + err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); + if (err) { + fprintf(stderr, "Failed to dispatch task %d\n", pid); + exit_req = 1; + } else { + nr_vruntime_dispatches++; + } + + return err; +} + +static void dispatch_batch(void) +{ + auto thread_to_dispatch = mfq.Schedule(); + + if (thread_to_dispatch.size() == 0) { + return; + } + + for (auto task : thread_to_dispatch) { + __u32 pid = task_pid(task); + printf("%d schedule thread %d\n", getpid(), pid); + int err = dispatch_task(pid); + if (err) { + fprintf(stderr, "Failed to dispatch task %d\n", pid); + return; + } + } + return; + +} +//**************************************dispatch_batch***************************************** + + +static void *run_stats_printer(void *arg) +{ + while (!exit_req) { + // __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; + + // nr_failed_enqueues = skel->bss->nr_failed_enqueues; + // nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; + // nr_user_enqueues = skel->bss->nr_user_enqueues; + // total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; + + // printf("o-----------------------o\n"); + // printf("| BPF ENQUEUES |\n"); + // printf("|-----------------------|\n"); + // printf("| kern: %10llu |\n", 
nr_kernel_enqueues);
+        // printf("| user: %10llu |\n", nr_user_enqueues);
+        // printf("| failed: %10llu |\n", nr_failed_enqueues);
+        // printf("| -------------------- |\n");
+        // printf("| total: %10llu |\n", total);
+        // printf("| |\n");
+        // printf("|-----------------------|\n");
+        // printf("| VRUNTIME / USER |\n");
+        // printf("|-----------------------|\n");
+        // printf("| enq: %10llu |\n", nr_vruntime_enqueues);
+        // printf("| disp: %10llu |\n", nr_vruntime_dispatches);
+        // printf("o-----------------------o\n");
+        // printf("%d\n",getpid());
+        // printf("\n\n");
+        sleep(1);
+    }
+
+    return NULL;
+}
+
+static int spawn_stats_thread(void)
+{
+    pthread_t stats_printer;
+
+    return pthread_create(&stats_printer, NULL, run_stats_printer, NULL);
+}
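Note: run_stats_printer() above currently only sleeps; the printout is commented out (a second, identical copy of the commented block was dropped). If it is ever re-enabled, a compact sketch that reads the BPF-side counters from the skeleton might look like the following; every field referenced already exists in this file or in scx_mfq.bpf.c, but the helper itself is illustrative, not part of this commit.

static void print_stats_once(void)
{
    __u64 kern = skel->bss->nr_kernel_enqueues;
    __u64 user = skel->bss->nr_user_enqueues;
    __u64 fail = skel->bss->nr_failed_enqueues;

    printf("bpf enqueues: kern=%llu user=%llu failed=%llu total=%llu\n",
           kern, user, fail, kern + user + fail);
    printf("user sched:   enq=%llu disp=%llu\n",
           nr_vruntime_enqueues, nr_vruntime_dispatches);
}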
+static int bootstrap(int argc, char **argv)
+{
+    int err;
+    int opt;
+    struct sched_param sched_param = {
+        .sched_priority = sched_get_priority_max(SCHED_EXT),
+    };
+    bool switch_partial = false;
+
+    signal(SIGINT, sigint_handler);
+    signal(SIGTERM, sigint_handler);
+    libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+    /*
+     * Enforce that the user scheduler task is managed by sched_ext. The
+     * task eagerly drains the list of enqueued tasks in its main work
+     * loop, and then yields the CPU. The BPF scheduler only schedules the
+     * user space scheduler task when at least one other task in the system
+     * needs to be scheduled.
+     */
+    err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param);
+    if (err) {
+        fprintf(stderr, "Failed to set scheduler to SCHED_EXT: %s\n", strerror(errno));
+        return err;
+    }
+
+    while ((opt = getopt(argc, argv, "b:ph")) != -1) {
+        switch (opt) {
+        case 'b':
+            batch_size = strtoul(optarg, NULL, 0);
+            break;
+        case 'p':
+            switch_partial = true;
+            break;
+        default:
+            fprintf(stderr, help_fmt, basename(argv[0]));
+            exit(opt != 'h');
+        }
+    }
+
+    /*
+     * It's not always safe to allocate in a user space scheduler, as an
+     * enqueued task could hold a lock that we require in order to be able
+     * to allocate.
+     */
+    err = mlockall(MCL_CURRENT | MCL_FUTURE);
+    if (err) {
+        fprintf(stderr, "Failed to prefault and lock address space: %s\n",
+            strerror(err));
+        return err;
+    }
+
+    skel = scx_mfq__open();
+    if (!skel) {
+        fprintf(stderr, "Failed to open scheduler: %s\n", strerror(errno));
+        return errno;
+    }
+    skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
+    assert(skel->rodata->num_possible_cpus > 0);
+    skel->rodata->usersched_pid = getpid();
+    assert(skel->rodata->usersched_pid > 0);
+    skel->rodata->switch_partial = switch_partial;
+
+    err = scx_mfq__load(skel);
+    if (err) {
+        fprintf(stderr, "Failed to load scheduler: %s\n", strerror(err));
+        goto destroy_skel;
+    }
+
+    enqueued_fd = bpf_map__fd(skel->maps.enqueued);
+    dispatched_fd = bpf_map__fd(skel->maps.dispatched);
+    assert(enqueued_fd > 0);
+    assert(dispatched_fd > 0);
+
+    err = spawn_stats_thread(); /* spawn the stats-printing thread */
+    if (err) {
+        fprintf(stderr, "Failed to spawn stats thread: %s\n", strerror(err));
+        goto destroy_skel;
+    }
+
+    ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops);
+    if (!ops_link) {
+        fprintf(stderr, "Failed to attach struct ops: %s\n", strerror(errno));
+        err = errno;
+        goto destroy_skel;
+    }
+
+    return 0;
+
+destroy_skel:
+    scx_mfq__destroy(skel);
+    exit_req = 1;
+    return err;
+}
+
+static void sched_main_loop(void)
+{
+    while (!exit_req) {
+        drain_enqueued_map();
+        dispatch_batch();
+        sched_yield();
+    }
+}
+
+int main(int argc, char **argv)
+{
+    int err;
+
+    /*
+     * NB: the file-scope mfq object is constructed with the default
+     * batch_size before getopt() runs, so a -b passed on the command
+     * line does not resize it.
+     */
+    err = bootstrap(argc, argv);
+    if (err) {
+        fprintf(stderr, "Failed to bootstrap scheduler: %s\n", strerror(err));
+        return err;
+    }
+
+    sched_main_loop();
+
+    exit_req = 1;
+    bpf_link__destroy(ops_link);
+    uei_print(&skel->bss->uei);
+    scx_mfq__destroy(skel);
+    return 0;
+}
diff --git a/sched/mfq/scx_mfq_common.h b/sched/mfq/scx_mfq_common.h new file mode 100644 index 0000000000000000000000000000000000000000..8382bbc0b983e0d576c8dc8e86c4ae497684f014 --- /dev/null +++ b/sched/mfq/scx_mfq_common.h @@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta, Inc */
+
+#ifndef __SCX_USERLAND_COMMON_H
+#define __SCX_USERLAND_COMMON_H
+
+#define USERLAND_MAX_TASKS 60000
+
+#define MFQ_QUEUE_NO1_SIZE 1000
+#define MFQ_QUEUE_NO2_SIZE 2000
+#define MFQ_QUEUE_NO3_SIZE 3000
+#define MFQ_QUEUE_NO4_SIZE 4000
+#define MFQ_QUEUE_NO5_SIZE 5000
+
+
+#include "../../lib/cos_client.h"
+#include "../../lib/cos.h"
+
+/*
+ * An instance of a task that has been enqueued by the kernel for consumption
+ * by a user space global scheduler thread.
+ */
+struct scx_userland_enqueued_task {
+    __s32 pid;
+    u64 sum_exec_runtime;
+    u64 weight;
+    __s32 queue_id;
+};
+
+#endif // __SCX_USERLAND_COMMON_H
diff --git a/sched/scx_common.bpf.h b/sched/scx_common.bpf.h new file mode 100644 index 0000000000000000000000000000000000000000..e56de9dc86f288caf1e9fbdd109cb0a09991fbc1 --- /dev/null +++ b/sched/scx_common.bpf.h @@ -0,0 +1,288 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef __SCHED_EXT_COMMON_BPF_H +#define __SCHED_EXT_COMMON_BPF_H + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <linux/errno.h> +#include "user_exit_info.h" + +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ +#define PF_EXITING 0x00000004 +#define CLOCK_MONOTONIC 1 + +/* + * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can + * lead to really confusing misbehaviors. Let's trigger a build failure. + */ +static inline void ___vmlinux_h_sanity_check___(void) +{ + _Static_assert(SCX_DSQ_FLAG_BUILTIN, + "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); +} + +void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; + +static inline __attribute__((format(printf, 1, 2))) +void ___scx_bpf_error_format_checker(const char *fmt, ...) {} + +/* + * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments + * instead of an array of u64. Note that __param[] must have at least one + * element to keep the verifier happy. + */ +#define scx_bpf_error(fmt, args...) \ +({ \ + static char ___fmt[] = fmt; \ + unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ + \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ + ___bpf_fill(___param, args); \ + _Pragma("GCC diagnostic pop") \ + \ + scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ + \ + ___scx_bpf_error_format_checker(fmt, ##args); \ +}) + +void scx_bpf_switch_all(void) __ksym; +s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; +bool scx_bpf_consume(u64 dsq_id) __ksym; +u32 scx_bpf_dispatch_nr_slots(void) __ksym; +void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; +void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; +void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; +s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; +bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed) __ksym; +const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; +void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; +void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +bool scx_bpf_task_running(const struct task_struct *p) __ksym; +s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; +u32 scx_bpf_reenqueue_local(void) __ksym; + +#define BPF_STRUCT_OPS(name, args...) \ +SEC("struct_ops/"#name) \ +BPF_PROG(name, ##args) + +#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ +SEC("struct_ops.s/"#name) \ +BPF_PROG(name, ##args) + +/** + * MEMBER_VPTR - Obtain the verified pointer to a struct or array member + * @base: struct or array to index + * @member: dereferenced member (e.g. ->field, [idx0][idx1], ...) + * + * The verifier often gets confused by the instruction sequence the compiler + * generates for indexing struct fields or arrays. This macro forces the + * compiler to generate a code sequence which first calculates the byte offset, + * checks it against the struct or array size and add that byte offset to + * generate the pointer to the member to help the verifier. + * + * Ideally, we want to abort if the calculated offset is out-of-bounds. 
However, + * BPF currently doesn't support abort, so evaluate to NULL instead. The caller + * must check for NULL and take appropriate action to appease the verifier. To + * avoid confusing the verifier, it's best to check for NULL and dereference + * immediately. + * + * vptr = MEMBER_VPTR(my_array, [i][j]); + * if (!vptr) + * return error; + * *vptr = new_value; + */ +#define MEMBER_VPTR(base, member) (typeof(base member) *)({ \ + u64 __base = (u64)base; \ + u64 __addr = (u64)&(base member) - __base; \ + asm volatile ( \ + "if %0 <= %[max] goto +2\n" \ + "%0 = 0\n" \ + "goto +1\n" \ + "%0 += %1\n" \ + : "+r"(__addr) \ + : "r"(__base), \ + [max]"i"(sizeof(base) - sizeof(base member))); \ + __addr; \ +}) + +/* + * BPF core and other generic helpers + */ + +/* list and rbtree */ +#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) + +void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; +void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; + +#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) +#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; +struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) __ksym; +void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) __ksym; +struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; + +/* task */ +struct task_struct *bpf_task_from_pid(s32 pid) __ksym; +struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; +void bpf_task_release(struct task_struct *p) __ksym; + +/* cgroup */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; +void bpf_cgroup_release(struct cgroup *cgrp) __ksym; +struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; + +/* cpumask */ +struct bpf_cpumask *bpf_cpumask_create(void) __ksym; +struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; +u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; +void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; +bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, + const struct cpumask *src2) __ksym; +bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) 
__ksym; +bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; +bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; +bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; +void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; +u32 bpf_cpumask_any(const struct cpumask *cpumask) __ksym; +u32 bpf_cpumask_any_and(const struct cpumask *src1, const struct cpumask *src2) __ksym; + +/* rcu */ +void bpf_rcu_read_lock(void) __ksym; +void bpf_rcu_read_unlock(void) __ksym; + +/* BPF core iterators from tools/testing/selftests/bpf/progs/bpf_misc.h */ +struct bpf_iter_num; + +extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __ksym; +extern int *bpf_iter_num_next(struct bpf_iter_num *it) __ksym; +extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __ksym; + +#ifndef bpf_for_each +/* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for + * using BPF open-coded iterators without having to write mundane explicit + * low-level loop logic. Instead, it provides for()-like generic construct + * that can be used pretty naturally. E.g., for some hypothetical cgroup + * iterator, you'd write: + * + * struct cgroup *cg, *parent_cg = <...>; + * + * bpf_for_each(cgroup, cg, parent_cg, CG_ITER_CHILDREN) { + * bpf_printk("Child cgroup id = %d", cg->cgroup_id); + * if (cg->cgroup_id == 123) + * break; + * } + * + * I.e., it looks almost like high-level for each loop in other languages, + * supports continue/break, and is verifiable by BPF verifier. + * + * For iterating integers, the difference betwen bpf_for_each(num, i, N, M) + * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to + * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int + * *`, not just `int`. So for integers bpf_for() is more convenient. + * + * Note: this macro relies on C99 feature of allowing to declare variables + * inside for() loop, bound to for() loop lifetime. It also utilizes GCC + * extension: __attribute__((cleanup(<func>))), supported by both GCC and + * Clang. + */ +#define bpf_for_each(type, cur, args...) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_##type ___it __attribute__((aligned(8), /* enforce, just in case */, \ + cleanup(bpf_iter_##type##_destroy))), \ + /* ___p pointer is just to call bpf_iter_##type##_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_##type##_new(&___it, ##args), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_##type##_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_##type##_destroy, (void *)0); \ + /* iteration and termination check */ \ + (((cur) = bpf_iter_##type##_next(&___it))); \ +) +#endif /* bpf_for_each */ + +#ifndef bpf_for +/* bpf_for(i, start, end) implements a for()-like looping construct that sets + * provided integer variable *i* to values starting from *start* through, + * but not including, *end*. It also proves to BPF verifier that *i* belongs + * to range [start, end), so this can be used for accessing arrays without + * extra checks. + * + * Note: *start* and *end* are assumed to be expressions with no side effects + * and whose values do not change throughout bpf_for() loop execution. They do + * not have to be statically known or constant, though. 
+ * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. + */ +#define bpf_for(i, start, end) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, (start), (end)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + ({ \ + /* iteration step */ \ + int *___t = bpf_iter_num_next(&___it); \ + /* termination and bounds check */ \ + (___t && ((i) = *___t, (i) >= (start) && (i) < (end))); \ + }); \ +) +#endif /* bpf_for */ + +#ifndef bpf_repeat +/* bpf_repeat(N) performs N iterations without exposing iteration number + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. + */ +#define bpf_repeat(N) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, 0, (N)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + bpf_iter_num_next(&___it); \ + /* nothing here */ \ +) +#endif /* bpf_repeat */ + +#endif /* __SCHED_EXT_COMMON_BPF_H */ diff --git a/sched/sjf/scx_sjf.bpf.c b/sched/sjf/scx_sjf.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..7f5b6afee49d08f65f038bca8639ba94dbc33013 --- /dev/null +++ b/sched/sjf/scx_sjf.bpf.c @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal userland scheduler. + * + * In terms of scheduling, this provides two different types of behaviors: + * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. + * All such tasks are direct-dispatched from the kernel, and are never + * enqueued in user space. + * 2. A primitive vruntime scheduler that is implemented in user space, for all + * other tasks. + * + * Some parts of this example user space scheduler could be implemented more + * efficiently using more complex and sophisticated data structures. For + * example, rather than using BPF_MAP_TYPE_QUEUE's, + * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between + * user space and kernel space. Similarly, we use a simple vruntime-sorted list + * in user space, but an rbtree could be used instead. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#include <string.h> +#include "../scx_common.bpf.h" +#include "scx_sjf_common.h" + +char _license[] SEC("license") = "GPL"; + +const volatile bool switch_partial; +const volatile s32 usersched_pid; + +/* !0 for veristat, set during init */ +const volatile u32 num_possible_cpus = 64; + +/* Stats that are printed by user space. 
*/ +u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; + +struct user_exit_info uei; + +/* + * Whether the user space scheduler needs to be scheduled due to a task being + * enqueued in user space. + */ +static bool usersched_needed; + +/* + * The map containing tasks that are enqueued in user space from the kernel. + * + * This map is drained by the user space scheduler. + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, USERLAND_MAX_TASKS); + __type(value, struct scx_userland_enqueued_task); +} enqueued SEC(".maps"); + +/* + * The map containing tasks that are dispatched to the kernel from user space. + * + * Drained by the kernel in userland_dispatch(). + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, USERLAND_MAX_TASKS); + __type(value, s32); +} dispatched SEC(".maps"); + +/* Per-task scheduling context */ +struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ */ +}; + +/* Map that contains task-local storage. */ +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +static bool is_usersched_task(const struct task_struct *p) +{ + return p->pid == usersched_pid; +} + +static bool keep_in_kernel(const struct task_struct *p) +{ + return p->nr_cpus_allowed < num_possible_cpus; +} + +static struct task_struct *usersched_task(void) +{ + struct task_struct *p; + + p = bpf_task_from_pid(usersched_pid); + /* + * Should never happen -- the usersched task should always be managed + * by sched_ext. + */ + if (!p) + scx_bpf_error("Failed to find usersched task %d", usersched_pid); + + return p; +} + +s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + if (keep_in_kernel(p)) { + s32 cpu; + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to look up task-local storage for %s", p->comm); + return -ESRCH; + } + + if (p->nr_cpus_allowed == 1 || + scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { + tctx->force_local = true; + return prev_cpu; + } + + cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr); + if (cpu >= 0) { + tctx->force_local = true; + return cpu; + } + } + + return prev_cpu; +} + +static void dispatch_user_scheduler(void) +{ + struct task_struct *p; + + usersched_needed = false; + p = usersched_task(); + if (p) { + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) +{ + + struct scx_userland_enqueued_task task; + + memset(&task, 0, sizeof(task)); + task.pid = p->pid; + task.sum_exec_runtime = p->se.sum_exec_runtime; + task.weight = p->scx.weight; + task.tgid = p->tgid; + // bpf_trace_printk("enqueue taggered! task->pid = %d, tgid = %d\n",p->pid,p->tgid); + if (bpf_map_push_elem(&enqueued, &task, 0)) { + /* + * If we fail to enqueue the task in user space, put it + * directly on the global DSQ. 
+ */ + __sync_fetch_and_add(&nr_failed_enqueues, 1); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + } else { + __sync_fetch_and_add(&nr_user_enqueues, 1); + usersched_needed = true; + } +} + +void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) +{ + if (keep_in_kernel(p)) { + u64 dsq_id = SCX_DSQ_GLOBAL; + struct task_ctx *tctx; + + /* + // Per-task scheduling context + struct task_ctx { + bool force_local; /* Dispatch directly to local DSQ + }; + */ + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("Failed to lookup task ctx for %s", p->comm); + return; + } + + if (tctx->force_local) + dsq_id = SCX_DSQ_LOCAL; + tctx->force_local = false; + scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&nr_kernel_enqueues, 1); + return; + } else if (!is_usersched_task(p)) { + enqueue_task_in_user_space(p, enq_flags); + } +} +/* + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using + * scx_bpf_consume(). +*/ +void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) +{ + /* + * Whether the user space scheduler needs to be scheduled due to a task being + * enqueued in user space. + */ + if (usersched_needed) + dispatch_user_scheduler(); + + bpf_repeat(4096) { + s32 pid; + struct task_struct *p; + + if (bpf_map_pop_elem(&dispatched, &pid)) + break; + + /* + * The task could have exited by the time we get around to + * dispatching it. Treat this as a normal occurrence, and simply + * move onto the next iteration. + */ + p = bpf_task_from_pid(pid); + if (!p) + continue; + + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + bpf_task_release(p); + } +} + +s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p, + struct scx_enable_args *args) +{ + if (bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE)) + return 0; + else + return -ENOMEM; +} + +s32 BPF_STRUCT_OPS(userland_init) +{ + if (num_possible_cpus == 0) { + scx_bpf_error("User scheduler # CPUs uninitialized (%d)", + num_possible_cpus); + return -EINVAL; + } + + if (usersched_pid <= 0) { + scx_bpf_error("User scheduler pid uninitialized (%d)", + usersched_pid); + return -EINVAL; + } + + if (!switch_partial) + scx_bpf_switch_all(); + return 0; +} + +void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) +{ + uei_record(&uei, ei); +} + +SEC(".struct_ops") +struct sched_ext_ops userland_ops = { + .select_cpu = (void *)userland_select_cpu, + .enqueue = (void *)userland_enqueue, + .dispatch = (void *)userland_dispatch, + .prep_enable = (void *)userland_prep_enable, + .init = (void *)userland_init, + .exit = (void *)userland_exit, + .timeout_ms = 3000, + .name = "userland", +}; diff --git a/sched/sjf/scx_sjf.c b/sched/sjf/scx_sjf.c new file mode 100644 index 0000000000000000000000000000000000000000..60dce55d2c0d82a3ec6a6ebd7e1368cba7dcaf10 --- /dev/null +++ b/sched/sjf/scx_sjf.c @@ -0,0 +1,451 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A demo sched_ext user space scheduler which provides vruntime semantics + * using a simple ordered-list implementation. + * + * Each CPU in the system resides in a single, global domain. This precludes + * the need to do any load balancing between domains. The scheduler could + * easily be extended to support multiple domains, with load balancing + * happening in user space. 
+ * + * Any task which has any CPU affinity is scheduled entirely in BPF. This + * program only schedules tasks which may run on any CPU. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +//#define _GNU_SOURCE +#include <stdio.h> +#include <unistd.h> +#include <sched.h> +#include <signal.h> +#include <assert.h> +#include <libgen.h> +#include <pthread.h> +#include <bpf/bpf.h> +#include <sys/mman.h> +#include <sys/queue.h> +#include <sys/syscall.h> +#include <map> + +#include "../user_exit_info.h" +#include "scx_sjf_common.h" +#include "scx_sjf.skel.h" + +#include "hash.h" + +const char help_fmt[] = +"A minimal userland sched_ext scheduler.\n" +"\n" +"See the top-level comment in .bpf.c for more details.\n" +"\n" +"Usage: %s [-b BATCH] [-p]\n" +"\n" +" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" +" -p Don't switch all, switch only tasks on SCHED_EXT policy\n" +" -h Display this help and exit\n"; + +/* Defined in UAPI */ +#define SCHED_EXT 7 + +/* Number of tasks to batch when dispatching to user space. */ +static __u32 batch_size = 8; + +static volatile int exit_req; +static int enqueued_fd, dispatched_fd; + +static struct scx_sjf *skel; +static struct bpf_link *ops_link; + +/* Stats collected in user space. */ +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches; + +/* The data structure containing tasks that are enqueued in user space. */ +struct enqueued_task { + LIST_ENTRY(enqueued_task) entries; + __u64 sum_exec_runtime; + double vruntime; +}; + +/* + * Use a vruntime-sorted list to store tasks. This could easily be extended to + * a more optimal data structure, such as an rbtree as is done in CFS. We + * currently elect to use a sorted list to simplify the example for + * illustrative purposes. + */ +LIST_HEAD(listhead, enqueued_task); + +/* + * A vruntime-sorted list of tasks. The head of the list contains the task with + * the lowest vruntime. That is, the task that has the "highest" claim to be + * scheduled. + */ +static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); + +/* + * The statically allocated array of tasks. We use a statically allocated list + * here to avoid having to allocate on the enqueue path, which could cause a + * deadlock. A more substantive user space scheduler could e.g. provide a hook + * for newly enabled tasks that are passed to the scheduler from the + * .prep_enable() callback to allows the scheduler to allocate on safe paths. 
+ */ +struct enqueued_task tasks[USERLAND_MAX_TASKS]; + +static double min_vruntime; + +static void sigint_handler(int userland) +{ + exit_req = 1; +} + +static __u32 task_pid(const struct enqueued_task *task) +{ + return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); +} + +static int dispatch_task(s32 pid) +{ + int err; + + err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); + if (err) { + fprintf(stderr, "Failed to dispatch task %d\n", pid); + exit_req = 1; + } else { + nr_vruntime_dispatches++; + } + + return err; +} + +static struct enqueued_task *get_enqueued_task(__s32 pid) +{ + if (pid >= USERLAND_MAX_TASKS) + return NULL; + + return &tasks[pid]; +} + +static double calc_vruntime_delta(__u64 weight, __u64 delta) +{ + double weight_f = (double)weight / 100.0; + double delta_f = (double)delta; + + return delta_f / weight_f; +} + +static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) +{ + __u64 delta; + + delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; + + enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); + if (min_vruntime > enqueued->vruntime) + enqueued->vruntime = min_vruntime; + enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; +} + +static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) +{ + struct enqueued_task *curr, *enqueued, *prev; + + curr = get_enqueued_task(bpf_task->pid); + if (!curr) + return ENOENT; + + update_enqueued(curr, bpf_task); + nr_vruntime_enqueues++; + + /* + * Enqueue the task in a vruntime-sorted list. A more optimal data + * structure such as an rbtree could easily be used as well. We elect + * to use a list here simply because it's less code, and thus the + * example is less convoluted and better serves to illustrate what a + * user space scheduler could look like. 
+ */
+
+    if (LIST_EMPTY(&vruntime_head)) {
+        LIST_INSERT_HEAD(&vruntime_head, curr, entries);
+        return 0;
+    }
+
+    LIST_FOREACH(enqueued, &vruntime_head, entries) {
+        if (curr->vruntime <= enqueued->vruntime) {
+            LIST_INSERT_BEFORE(enqueued, curr, entries);
+            return 0;
+        }
+        prev = enqueued;
+    }
+
+    LIST_INSERT_AFTER(prev, curr, entries);
+
+    return 0;
+}
+
+
+#include <fcntl.h> // for O_RDWR and open
+
+std::map<__s32, LFHashTable<struct entry>> tgid2hashtable;
+
+static void drain_enqueued_map(void)
+{
+    while (1) {
+        struct scx_userland_enqueued_task task;
+        int err;
+        if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) {
+            return;
+        }
+
+        /* do schedule */
+        __s32 tgid = task.tgid;
+
+        if (tgid != getpid()) {
+            printf("agent %d: scheduling thread %d of process %d\n", getpid(), task.pid, tgid);
+
+            if (tgid2hashtable.count(tgid) == 0) {
+                char buf[128];
+                sprintf(buf, "/etc/cos/shm/shm_%d", tgid);
+                int shm_fd = open(buf, O_RDWR);
+                void* shm = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
+                if (shm == MAP_FAILED) { // the client doesn't use shared memory
+                    goto enqueue;
+                }
+                tgid2hashtable[tgid] = LFHashTable<struct entry>(shm, SHM_SIZE, 0);
+            }
+
+            struct entry tmp = tgid2hashtable[tgid].Get(task.pid);
+            memcpy(&(task.data), &tmp, sizeof(struct entry));
+            printf("thread %d ddl=%d\n", task.pid, task.data.ddl);
+        }
+
+    enqueue:
+        err = vruntime_enqueue(&task);
+        if (err) {
+            fprintf(stderr, "Failed to enqueue task %d: %s\n",
+                task.pid, strerror(err));
+            exit_req = 1;
+            return;
+        }
+    }
+}
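Note: the mapping path in drain_enqueued_map() above never closes shm_fd, and when open() fails it still calls mmap() on the bad descriptor, relying on MAP_FAILED to bail out. A slightly hardened sketch of the same lookup, using only APIs already used in this file (SHM_SIZE comes from lib/cos.h via the common header), could be:

// Maps a client's shm block, or returns nullptr if the client has none.
static void *map_client_shm(int tgid)
{
    char buf[128];
    snprintf(buf, sizeof(buf), "/etc/cos/shm/shm_%d", tgid);

    int fd = open(buf, O_RDWR);
    if (fd < 0)
        return nullptr; // the client doesn't use shared memory

    void *shm = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    close(fd); // the mapping keeps its own reference to the file
    return shm == MAP_FAILED ? nullptr : shm;
}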
+// Dispatch up to batch_size tasks from the head of the vruntime-sorted list to the kernel.
+static void dispatch_batch(void)
+{
+    __u32 i;
+
+    for (i = 0; i < batch_size; i++) {
+        struct enqueued_task *task;
+        int err;
+        __s32 pid;
+
+        task = LIST_FIRST(&vruntime_head);
+        if (!task)
+            return;
+
+        min_vruntime = task->vruntime;
+        pid = task_pid(task);
+        LIST_REMOVE(task, entries);
+        err = dispatch_task(pid);
+        if (err) {
+            fprintf(stderr, "Failed to dispatch task %d in %u\n",
+                pid, i);
+            return;
+        }
+    }
+}
+
+static void *run_stats_printer(void *arg)
+{
+    while (!exit_req) {
+        // __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total;
+
+        // nr_failed_enqueues = skel->bss->nr_failed_enqueues;
+        // nr_kernel_enqueues = skel->bss->nr_kernel_enqueues;
+        // nr_user_enqueues = skel->bss->nr_user_enqueues;
+        // total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues;
+
+        // printf("o-----------------------o\n");
+        // printf("| BPF ENQUEUES |\n");
+        // printf("|-----------------------|\n");
+        // printf("| kern: %10llu |\n", nr_kernel_enqueues);
+        // printf("| user: %10llu |\n", nr_user_enqueues);
+        // printf("| failed: %10llu |\n", nr_failed_enqueues);
+        // printf("| -------------------- |\n");
+        // printf("| total: %10llu |\n", total);
+        // printf("| |\n");
+        // printf("|-----------------------|\n");
+        // printf("| VRUNTIME / USER |\n");
+        // printf("|-----------------------|\n");
+        // printf("| enq: %10llu |\n", nr_vruntime_enqueues);
+        // printf("| disp: %10llu |\n", nr_vruntime_dispatches);
+        // printf("o-----------------------o\n");
+        // printf("%d\n",getpid());
+        // printf("\n\n");
+        sleep(1);
+    }
+
+    return NULL;
+}
+
+static int spawn_stats_thread(void)
+{
+    pthread_t stats_printer;
+
+    return pthread_create(&stats_printer, NULL, run_stats_printer, NULL);
+}
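For intuition about the weight scaling in calc_vruntime_delta() earlier in this file: vruntime advances at delta / (weight / 100), so heavier tasks accumulate vruntime more slowly and reach the head of the sorted list more often. A standalone check (illustrative only, mirroring the committed formula):

#include <cstdio>

// Mirrors calc_vruntime_delta(): vruntime delta = delta / (weight / 100).
static double vruntime_delta(unsigned long long weight, unsigned long long delta)
{
    return (double)delta / ((double)weight / 100.0);
}

int main(void)
{
    printf("%.0f\n", vruntime_delta(100, 1000000)); // 1000000: default weight runs at wall rate
    printf("%.0f\n", vruntime_delta(200, 1000000)); // 500000: double weight accrues half as fast
    return 0;
}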
+static int bootstrap(int argc, char **argv)
+{
+    int err;
+    int opt;
+    struct sched_param sched_param = {
+        .sched_priority = sched_get_priority_max(SCHED_EXT),
+    };
+    bool switch_partial = false;
+
+    signal(SIGINT, sigint_handler);
+    signal(SIGTERM, sigint_handler);
+    libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
+
+    /*
+     * Enforce that the user scheduler task is managed by sched_ext. The
+     * task eagerly drains the list of enqueued tasks in its main work
+     * loop, and then yields the CPU. The BPF scheduler only schedules the
+     * user space scheduler task when at least one other task in the system
+     * needs to be scheduled.
+     */
+    err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param);
+    if (err) {
+        fprintf(stderr, "Failed to set scheduler to SCHED_EXT: %s\n", strerror(errno));
+        return err;
+    }
+
+    while ((opt = getopt(argc, argv, "b:ph")) != -1) {
+        switch (opt) {
+        case 'b':
+            batch_size = strtoul(optarg, NULL, 0);
+            break;
+        case 'p':
+            switch_partial = true;
+            break;
+        default:
+            fprintf(stderr, help_fmt, basename(argv[0]));
+            exit(opt != 'h');
+        }
+    }
+
+    /*
+     * It's not always safe to allocate in a user space scheduler, as an
+     * enqueued task could hold a lock that we require in order to be able
+     * to allocate.
+     */
+    err = mlockall(MCL_CURRENT | MCL_FUTURE);
+    if (err) {
+        fprintf(stderr, "Failed to prefault and lock address space: %s\n",
+            strerror(err));
+        return err;
+    }
+
+    skel = scx_sjf__open();
+    if (!skel) {
+        fprintf(stderr, "Failed to open scheduler: %s\n", strerror(errno));
+        return errno;
+    }
+    skel->rodata->num_possible_cpus = libbpf_num_possible_cpus();
+    assert(skel->rodata->num_possible_cpus > 0);
+    skel->rodata->usersched_pid = getpid();
+    assert(skel->rodata->usersched_pid > 0);
+    skel->rodata->switch_partial = switch_partial;
+
+    err = scx_sjf__load(skel);
+    if (err) {
+        fprintf(stderr, "Failed to load scheduler: %s\n", strerror(err));
+        goto destroy_skel;
+    }
+
+    enqueued_fd = bpf_map__fd(skel->maps.enqueued);
+    dispatched_fd = bpf_map__fd(skel->maps.dispatched);
+    assert(enqueued_fd > 0);
+    assert(dispatched_fd > 0);
+
+    err = spawn_stats_thread(); /* spawn the stats-printing thread */
+    if (err) {
+        fprintf(stderr, "Failed to spawn stats thread: %s\n", strerror(err));
+        goto destroy_skel;
+    }
+
+    ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops);
+    if (!ops_link) {
+        fprintf(stderr, "Failed to attach struct ops: %s\n", strerror(errno));
+        err = errno;
+        goto destroy_skel;
+    }
+
+    return 0;
+
+destroy_skel:
+    scx_sjf__destroy(skel);
+    exit_req = 1;
+    return err;
+}
+
+static void sched_main_loop(void)
+{
+    while (!exit_req) {
+        drain_enqueued_map();
+        dispatch_batch();
+        sched_yield();
+    }
+}
+
+int main(int argc, char **argv)
+{
+    int err;
+
+    err = bootstrap(argc, argv);
+    if (err) {
+        fprintf(stderr, "Failed to bootstrap scheduler: %s\n", strerror(err));
+        return err;
+    }
+
+    sched_main_loop();
+
+    exit_req = 1;
+    bpf_link__destroy(ops_link);
+    uei_print(&skel->bss->uei);
+    scx_sjf__destroy(skel);
+    return 0;
+}
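Since struct entry (lib/cos_client.h) is memcpy()'d between client and agent through shared memory, both binaries must agree on its layout. A compile-time guard along these lines (a sketch, not part of this commit) could catch accidental ABI breaks:

#include <type_traits>
#include "cos_client.h" // struct entry
#include "cos.h"        // SHM_SIZE

static_assert(std::is_trivially_copyable<entry>::value,
              "struct entry must be trivially copyable to cross the shm boundary");
static_assert(sizeof(entry) <= SHM_SIZE,
              "struct entry must fit within one shm block");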
diff --git a/sched/sjf/scx_sjf_common.h b/sched/sjf/scx_sjf_common.h new file mode 100644 index 0000000000000000000000000000000000000000..b614f94554955846cebee4fe8bca47f2bf1e5e7f --- /dev/null +++ b/sched/sjf/scx_sjf_common.h @@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta, Inc */
+
+#ifndef __SCX_USERLAND_COMMON_H
+#define __SCX_USERLAND_COMMON_H
+
+#define USERLAND_MAX_TASKS 60000
+
+#include "cos_client.h"
+#include "cos.h"
+
+/*
+ * An instance of a task that has been enqueued by the kernel for consumption
+ * by a user space global scheduler thread.
+ */
+struct scx_userland_enqueued_task {
+    __s32 pid;
+    u64 sum_exec_runtime;
+    u64 weight;
+
+    /* new fields */
+    __s32 tgid; /* pid of the process this thread belongs to; filled in by
+                 * enqueue_task_in_user_space() in the BPF program */
+    struct entry data;
+};
+
+#endif // __SCX_USERLAND_COMMON_H
diff --git a/sched/user_exit_info.h b/sched/user_exit_info.h new file mode 100644 index 0000000000000000000000000000000000000000..e701ef0e0b86c333207d8e315f6656cd3095195c --- /dev/null +++ b/sched/user_exit_info.h @@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Define struct user_exit_info which is shared between BPF and userspace parts
+ * to communicate exit status and other information.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifndef __USER_EXIT_INFO_H
+#define __USER_EXIT_INFO_H
+
+struct user_exit_info {
+    int type;
+    char reason[128];
+    char msg[1024];
+};
+
+#ifdef __bpf__
+
+#include "vmlinux.h"
+#include <bpf/bpf_core_read.h>
+
+static inline void uei_record(struct user_exit_info *uei,
+                  const struct scx_exit_info *ei)
+{
+    bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason);
+    bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg);
+    /* use __sync to force memory barrier */
+    __sync_val_compare_and_swap(&uei->type, uei->type, ei->type);
+}
+
+#else   /* !__bpf__ */
+
+static inline bool uei_exited(struct user_exit_info *uei)
+{
+    /* use __sync to force memory barrier */
+    return __sync_val_compare_and_swap(&uei->type, -1, -1);
+}
+
+static inline void uei_print(const struct user_exit_info *uei)
+{
+    fprintf(stderr, "EXIT: %s", uei->reason);
+    if (uei->msg[0] != '\0')
+        fprintf(stderr, " (%s)", uei->msg);
+    fputs("\n", stderr);
+}
+
+#endif  /* __bpf__ */
+#endif  /* __USER_EXIT_INFO_H */
diff --git a/tests/simple_test.cpp b/tests/simple_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9058878eb03fb47e2ec9a3fe899e4a9d298a70fd --- /dev/null +++ b/tests/simple_test.cpp @@ -0,0 +1,106 @@
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <chrono> // for timing
+#include <vector>
+#include <memory> // for std::unique_ptr
+#include <assert.h>
+
+#include "cos_thread.h"
+
+#define SCHED_EXT 7
+#define SCHED_NORMAL 0
+
+const sched_param param{};
+
+class Timer {
+public:
+    Timer() { startTime_ = std::chrono::high_resolution_clock::now(); }
+    ~Timer() {
+        // Convert the elapsed duration to milliseconds before handing it
+        // to printf; a raw duration cannot be passed through varargs.
+        double ms = std::chrono::duration<double, std::milli>(
+            std::chrono::high_resolution_clock::now() - startTime_).count();
+        printf("The test takes %0.1f ms\n", ms);
+    }
+private:
+    std::chrono::high_resolution_clock::time_point startTime_;
+};
+
+void TestOne() {
+    printf("\nStarting one worker test...\n");
+    auto t = CosThread(CosThread::KernelSchedulerType::kExt, [] {
+        printf("thread begin...\n");
+        sleep(1);
+        printf("sleep over.\n");
+
+        std::thread t2(
+            [] { assert(sched_getscheduler(0) == SCHED_EXT); });
+        t2.join();
+    });
+
+    t.WaitUntilInitComplete();
+    sched_setscheduler(t.tid(), SCHED_EXT, &param);
+    t.NotifyWork();
+    t.Join();
+    printf("\nFinished one worker test.\n");
+}
+
+
+void TestMany(int num_workers) {
+    printf("\nStarting many worker test...\n");
+    std::vector<std::unique_ptr<CosThread>> workers;
+
+    for (int i = 0; i < num_workers; i++) {
+        workers.emplace_back(new CosThread(CosThread::KernelSchedulerType::kExt, [] {
+            printf("working...\n");
+            sleep(1);
+        }));
+    }
+
+    for (auto& t : workers) {
+        t->WaitUntilInitComplete();
+        sched_setscheduler(t->tid(), SCHED_EXT, &param);
+        t->NotifyWork();
+    }
+    for
(auto& t : workers) t->Join();
+    printf("\nFinished many worker test.\n");
+}
+
+void TestSwitchToCfs() {
+    printf("\nStarting switch-to-cfs test...\n");
+    CosThread t = CosThread(CosThread::KernelSchedulerType::kExt, [] {
+        printf("thread begin...\n");
+        sleep(1);
+        printf("sleep over.\n");
+
+        printf("now switch to CFS...\n");
+        assert(sched_getscheduler(0) == SCHED_EXT);
+
+        assert(sched_setscheduler(0, SCHED_NORMAL, &param) == 0);
+        assert(sched_getscheduler(0) == SCHED_NORMAL);
+        printf("switched to CFS successfully!\n");
+    });
+
+    t.WaitUntilInitComplete();
+    sched_setscheduler(t.tid(), SCHED_EXT, &param);
+    t.NotifyWork();
+    t.Join();
+    printf("\nFinished switch-to-cfs test.\n");
+}
+
+int main(){
+    {
+        printf("***TestOne***\n");
+        Timer t = Timer();
+        TestOne();
+    }
+
+    {
+        printf("***TestMany***\n");
+        Timer t = Timer();
+        TestMany(100);
+    }
+
+    {
+        printf("***TestSwitchToCfs***\n");
+        Timer t = Timer();
+        TestSwitchToCfs();
+    }
+    return 0;
+}
\ No newline at end of file